import requests
from bs4 import BeautifulSoup
import json
import re
from urllib.parse import urljoin, urlparse
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException


class GoustoScraper:
    def __init__(self, use_selenium=True):
        self.base_url = "https://www.gousto.co.uk"
        self.use_selenium = use_selenium

        if use_selenium:
            # Setup Selenium WebDriver with more robust options
            chrome_options = Options()
            chrome_options.add_argument('--headless=new')  # Use new headless mode
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument('--disable-extensions')
            chrome_options.add_argument('--disable-background-timer-throttling')
            chrome_options.add_argument('--disable-renderer-backgrounding')
            chrome_options.add_argument('--disable-backgrounding-occluded-windows')
            chrome_options.add_argument('--disable-ipc-flooding-protection')
            chrome_options.add_argument('--window-size=1920,1080')
            chrome_options.add_argument('--remote-debugging-port=9222')
            chrome_options.add_argument('--disable-features=TranslateUI')
            chrome_options.add_argument('--disable-default-apps')
            chrome_options.add_argument('--disable-logging')
            chrome_options.add_argument('--disable-plugins')
            chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

            # Set additional options for stability
            chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
            chrome_options.add_experimental_option('useAutomationExtension', False)

            try:
                # Try different methods to initialize Chrome
                print("Attempting to initialize Chrome WebDriver...")

                # Method 1: Try with webdriver-manager (if available)
                try:
                    from webdriver_manager.chrome import ChromeDriverManager
                    from selenium.webdriver.chrome.service import Service
                    service = Service(ChromeDriverManager().install())
                    self.driver = webdriver.Chrome(service=service, options=chrome_options)
                    print("✓ Selenium WebDriver initialized with webdriver-manager")
                except ImportError:
                    print("webdriver-manager not available, trying system Chrome...")
                    # Method 2: Try system Chrome
                    self.driver = webdriver.Chrome(options=chrome_options)
                    print("✓ Selenium WebDriver initialized with system Chrome")
                except Exception as e:
                    print(f"Failed with system Chrome: {e}")
                    # Method 3: Try with explicit Chrome binary path
                    chrome_options.binary_location = "/usr/bin/google-chrome"  # Common Linux path
                    self.driver = webdriver.Chrome(options=chrome_options)
                    print("✓ Selenium WebDriver initialized with explicit binary path")

                self.wait = WebDriverWait(self.driver, 10)

                # Test the driver
                self.driver.get("https://www.google.com")
                print("✓ WebDriver test successful")

            except Exception as e:
                print(f"✗ All Selenium initialization methods failed: {e}")
                print("Common solutions:")
                print("1. Install Chrome browser: sudo apt-get install google-chrome-stable")
                print("2. Install webdriver-manager: pip install webdriver-manager")
                print("3. Download ChromeDriver manually from https://chromedriver.chromium.org/")
                print("4. Make sure Chrome and ChromeDriver versions match")
                print("\nFalling back to requests method...")
                self.use_selenium = False
                self._init_requests_session()
        else:
            self._init_requests_session()

    def _init_requests_session(self):
        """Initialize requests session as fallback"""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    def get_page_content_selenium(self, url):
        """Get page content using Selenium (handles JavaScript)"""
        try:
            print(f"Loading page with Selenium: {url}")
            self.driver.get(url)

            # Wait for recipe cards to load
            try:
                # Wait for recipe elements to be present
                self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "a")))

                # Scroll down to trigger lazy loading
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)

                # Scroll back up
                self.driver.execute_script("window.scrollTo(0, 0);")
                time.sleep(1)
            except TimeoutException:
                print("Timeout waiting for page elements to load")

            return self.driver.page_source

        except Exception as e:
            print(f"Error loading page with Selenium: {e}")
            return None

    def get_page_content_requests(self, url):
        """Get page content using requests (static HTML only)"""
        try:
            response = self.session.get(url)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Error fetching page with requests: {e}")
            return None

    def get_recipe_links(self, page_url):
        """Extract recipe links from the cookbook page"""
        # Get page content
        if self.use_selenium:
            html_content = self.get_page_content_selenium(page_url)
        else:
            html_content = self.get_page_content_requests(page_url)

        if not html_content:
            return []

        soup = BeautifulSoup(html_content, 'html.parser')

        # Debug: Print some of the HTML to see what we're working with
        print("Analyzing page structure...")
        print(f"Page title: {soup.title.string if soup.title else 'No title found'}")

        # More comprehensive selectors for Gousto
        recipe_links = set()  # Use set to avoid duplicates

        # Different selector strategies
        selectors = [
            # Try different link patterns
            'a[href*="/cookbook/recipes/"]:not([href*="?page="])',
            'a[href^="/cookbook/recipes/"]:not([href*="?"])',
            '[data-test-id*="recipe"] a',
            '[data-testid*="recipe"] a',
            '.recipe-card a',
            '.recipe-tile a',
            '.recipe a',
            'article a[href*="/cookbook/recipes/"]',
            'div[class*="recipe"] a',
            # More generic approaches
            'a[href*="/recipes/"]:not([href*="?page="])',
        ]

        for selector in selectors:
            try:
                links = soup.select(selector)
                print(f"Selector '{selector}' found {len(links)} links")

                for link in links:
                    href = link.get('href')
                    if href:
                        # Clean and validate the URL
                        if href.startswith('/'):
                            full_url = urljoin(self.base_url, href)
                        else:
                            full_url = href

                        # Filter out pagination and non-recipe links
                        if ('/cookbook/recipes/' in full_url and
                                '?page=' not in full_url and
                                full_url != page_url):
                            recipe_links.add(full_url)

                if recipe_links:
                    break  # If we found links with this selector, stop trying others

            except Exception as e:
                print(f"Error with selector '{selector}': {e}")
                continue

        recipe_links = list(recipe_links)
        print(f"Total unique recipe links found: {len(recipe_links)}")

        # Debug: Print first few links
        for i, link in enumerate(recipe_links[:3]):
            print(f"Sample link {i+1}: {link}")

        return recipe_links

    def filter_steps(self, raw_steps, recipe_title=''):
        """Filter out unwanted items from steps array (breadcrumbs, navigation, etc.)"""
        if not raw_steps:
            return []

        filtered_steps = []

        # Common unwanted step content to filter out
        unwanted_patterns = [
            'cookbook', 'recipes', 'home', 'menu', 'navigation',
            'skip to', 'back to', 'print recipe', 'save recipe',
            'rate this recipe', 'share', 'pinterest', 'facebook',
            'twitter', 'instagram'
        ]

        # Convert recipe title to lowercase for comparison
        recipe_title_lower = recipe_title.lower() if recipe_title else ''

        for i, step in enumerate(raw_steps):
            step_lower = step.lower().strip()

            # Skip empty steps
            if not step_lower:
                continue

            # Skip if it's exactly "Cookbook"
            if step_lower == 'cookbook':
                continue

            # Skip if it matches the recipe title exactly
            if recipe_title_lower and step_lower == recipe_title_lower:
                continue

            # Skip if it's very short and likely navigation (less than 10 chars)
            if len(step) < 10 and any(pattern in step_lower for pattern in unwanted_patterns):
                continue

            # Skip if it contains unwanted patterns and is short (likely navigation)
            if len(step) < 50 and any(pattern in step_lower for pattern in unwanted_patterns):
                continue

            # Skip steps that are just numbers (step numbers without content)
            if step.strip().isdigit():
                continue

            # Skip if it's just a single word that's likely navigation
            if len(step.split()) == 1 and len(step) < 15:
                # But keep single words that are likely cooking instructions
                cooking_words = ['preheat', 'heat', 'boil', 'simmer', 'bake', 'fry', 'mix', 'stir', 'serve']
                if not any(word in step_lower for word in cooking_words):
                    continue

            # Skip if it looks like a breadcrumb pattern (word > word > word)
            if '>' in step or '»' in step or ('/' in step and len(step.split()) <= 4):
                continue

            # If we've made it this far, it's likely a real cooking step
            filtered_steps.append(step.strip())

        # Additional filtering: remove the first few items if they still look like breadcrumbs
        # This handles cases where the first 1-2 items are navigation that slipped through
        if len(filtered_steps) >= 2:
            # Check if first item is very short and generic
            first_item = filtered_steps[0].lower()
            if (len(first_item) < 20 and
                    any(pattern in first_item for pattern in ['cookbook', 'recipe', 'home', 'menu'])):
                filtered_steps = filtered_steps[1:]

            # Check if second item is the recipe title or similar
            if (len(filtered_steps) >= 2 and recipe_title_lower and
                    filtered_steps[0].lower() == recipe_title_lower):
                filtered_steps = filtered_steps[1:]

        return filtered_steps

    def parse_ingredient(self, ingredient_text):
        """Parse ingredient text into structured format with name, amount, and unit"""
        ingredient_text = ingredient_text.strip()

        # Common units and their variations
        units = {
            # Volume
            'ml', 'millilitre', 'milliliters', 'millilitres',
            'l', 'litre', 'liter', 'litres', 'liters',
            'cup', 'cups', 'c',
            'tbsp', 'tablespoon', 'tablespoons', 'tbs', 'tb',
            'tsp', 'teaspoon', 'teaspoons', 'ts',
            'fl oz', 'fluid ounce', 'fluid ounces',
            'pint', 'pints', 'pt',
            # Weight
            'g', 'gram', 'grams', 'gr',
            'kg', 'kilogram', 'kilograms',
            'oz', 'ounce', 'ounces',
            'lb', 'pound', 'pounds', 'lbs',
            # Count/pieces
            'piece', 'pieces', 'pc', 'pcs',
            'slice', 'slices', 'clove', 'cloves',
            'bunch', 'bunches', 'handful', 'handfuls',
            'pinch', 'pinches', 'dash', 'dashes',
            'sprig', 'sprigs', 'stick', 'sticks',
            'can', 'cans', 'tin', 'tins', 'jar', 'jars',
            'packet', 'packets', 'pack', 'packs',
            'box', 'boxes', 'bag', 'bags',
            # Special measurements
            'to taste', 'as needed', 'optional'
        }

        # Create a pattern to match numbers (including fractions and decimals)
        number_pattern = r'\d+(?:[/.]\d+)?(?:\.\d+)?|\d*\.\d+'

        # Try to extract amount and unit
        ingredient = {
            'name': ingredient_text,
            'amount': '',
            'unit': ''
        }

        # Pattern 1: "2 cups flour" or "250g butter"
        pattern1 = r'^(' + number_pattern + r')\s*([a-zA-Z\s]+?)\s+(.+)$'
        match1 = re.match(pattern1, ingredient_text, re.IGNORECASE)

        if match1:
            amount = match1.group(1)
            potential_unit = match1.group(2).strip().lower()
            name = match1.group(3).strip()

            # Check if the potential unit is in our units list
            if any(unit in potential_unit for unit in units):
                ingredient['amount'] = amount
                ingredient['unit'] = match1.group(2).strip()  # Keep original case
                ingredient['name'] = name
                return ingredient

        # Pattern 2: "flour - 2 cups" or "butter (250g)"
        pattern2 = r'^(.+?)[\s\-\(]+(' + number_pattern + r')\s*([a-zA-Z\s]*?)[\)\s]*$'
        match2 = re.match(pattern2, ingredient_text, re.IGNORECASE)

        if match2:
            name = match2.group(1).strip()
            amount = match2.group(2)
            potential_unit = match2.group(3).strip().lower()

            if any(unit in potential_unit for unit in units) or not potential_unit:
                ingredient['name'] = name
                ingredient['amount'] = amount
                ingredient['unit'] = match2.group(3).strip()  # Keep original case
                return ingredient

        # Pattern 3: Just amount at start "2 onions" or "1 large egg"
        pattern3 = r'^(' + number_pattern + r')\s+(.+)$'
        match3 = re.match(pattern3, ingredient_text, re.IGNORECASE)

        if match3:
            amount = match3.group(1)
            rest = match3.group(2).strip()

            # Check if it starts with a unit
            words = rest.split()
            if words and words[0].lower() in units:
                ingredient['amount'] = amount
                ingredient['unit'] = words[0]
                ingredient['name'] = ' '.join(words[1:]) if len(words) > 1 else rest
                return ingredient
            else:
                # No explicit unit, treat as pieces/items
                ingredient['amount'] = amount
                ingredient['unit'] = ''
                ingredient['name'] = rest
                return ingredient

        # Pattern 4: Fractional amounts "1/2 cup sugar"
        fraction_pattern = r'^(\d+/\d+)\s+([a-zA-Z\s]+?)\s+(.+)$'
        match4 = re.match(fraction_pattern, ingredient_text, re.IGNORECASE)

        if match4:
            amount = match4.group(1)
            potential_unit = match4.group(2).strip().lower()
            name = match4.group(3).strip()

            if any(unit in potential_unit for unit in units):
                ingredient['amount'] = amount
                ingredient['unit'] = match4.group(2).strip()
                ingredient['name'] = name
                return ingredient

        # If no patterns match, return as-is (some ingredients might not have amounts)
        return ingredient

    def scrape_recipe_details(self, recipe_url):
        """Scrape detailed recipe information from individual recipe page"""
        try:
            if self.use_selenium:
                html_content = self.get_page_content_selenium(recipe_url)
            else:
                response = self.session.get(recipe_url)
                response.raise_for_status()
                html_content = response.text

            if not html_content:
                return None

            soup = BeautifulSoup(html_content, 'html.parser')

            recipe_data = {
                'url': recipe_url,
                'title': '',
                'image_url': '',
                'prep_time': '',
                'serving': '',
                'ingredients': [],
                'description': '',
                'steps': []
            }

            # Extract title with more selectors
            title_selectors = [
                'h1',
                '[data-test-id*="title"]',
                '[data-testid*="title"]',
                '.recipe-title',
                '.title',
                'title'
            ]

            for selector in title_selectors:
                title_elem = soup.select_one(selector)
                if title_elem and title_elem.get_text(strip=True):
                    recipe_data['title'] = title_elem.get_text(strip=True)
                    break

            # Extract image URL with more selectors
            img_selectors = [
                'img[src*="recipe"]',
                'img[alt*="recipe"]',
                '.recipe-image img',
                '.hero-image img',
                'img[src*="gousto"]',
                'main img',
                'img[src*="cloudinary"]'  # Common CDN for food images
            ]

            for selector in img_selectors:
                img_elem = soup.select_one(selector)
                if img_elem:
                    src = img_elem.get('src') or img_elem.get('data-src') or img_elem.get('data-lazy')
                    if src and ('recipe' in src.lower() or 'gousto' in src.lower()):
                        if src.startswith('//'):
                            src = 'https:' + src
                        elif src.startswith('/'):
                            src = urljoin(self.base_url, src)
                        recipe_data['image_url'] = src
                        break

            # Extract prep time
            time_patterns = [
                r'(\d+)\s*min',
                r'(\d+)\s*hour',
                r'(\d+)\s*hr'
            ]

            time_text = soup.get_text()
            for pattern in time_patterns:
                match = re.search(pattern, time_text, re.IGNORECASE)
                if match:
                    recipe_data['prep_time'] = match.group(0)
                    break

            # Extract serving information
            serving_pattern = r'(\d+)\s*(?:serve|serving|portion|people)'
            serving_match = re.search(serving_pattern, soup.get_text(), re.IGNORECASE)
            if serving_match:
                recipe_data['serving'] = serving_match.group(0)

            # Extract ingredients and parse them into structured format
            ingredient_selectors = [
                '.ingredient',
                '.ingredients li',
                '[data-test-id*="ingredient"]',
                'ul[class*="ingredient"] li',
                '.recipe-ingredients li'
            ]

            for selector in ingredient_selectors:
                ingredients = soup.select(selector)
                if ingredients:
                    raw_ingredients = [ing.get_text(strip=True) for ing in ingredients if ing.get_text(strip=True)]
                    # Parse each ingredient into structured format
                    recipe_data['ingredients'] = [self.parse_ingredient(ing) for ing in raw_ingredients]
                    break

            # Extract description
            desc_selectors = [
                '.description',
                '.recipe-description',
                'meta[name="description"]',
                '.intro',
                'p'
            ]

            for selector in desc_selectors:
                if selector == 'meta[name="description"]':
                    desc_elem = soup.select_one(selector)
                    if desc_elem:
                        recipe_data['description'] = desc_elem.get('content', '').strip()
                        if recipe_data['description']:
                            break
                else:
                    desc_elem = soup.select_one(selector)
                    if desc_elem and len(desc_elem.get_text(strip=True)) > 20:
                        recipe_data['description'] = desc_elem.get_text(strip=True)
                        break

            # Extract steps
            step_selectors = [
                '.step',
                '.instruction',
                '.method li',
                '.directions li',
                'ol li',
                '[data-test-id*="step"]'
            ]

            for selector in step_selectors:
                steps = soup.select(selector)
                if steps:
                    raw_steps = [step.get_text(strip=True) for step in steps if step.get_text(strip=True)]
                    # Filter out unwanted steps (breadcrumbs, navigation, etc.)
                    recipe_data['steps'] = self.filter_steps(raw_steps, recipe_data.get('title', ''))
                    break

            return recipe_data

        except Exception as e:
            print(f"Error scraping recipe {recipe_url}: {e}")
            return None

    def scrape_recipes(self, start_url, max_recipes=10):
        """Main method to scrape recipes"""
        print(f"Starting to scrape recipes from: {start_url}")
        print(f"Using {'Selenium' if self.use_selenium else 'Requests'} for web scraping")

        # Get recipe links from the main page
        recipe_links = self.get_recipe_links(start_url)

        if not recipe_links:
            print("No recipe links found. Possible reasons:")
            print("1. Website structure has changed")
            print("2. Content is loaded dynamically and requires JavaScript")
            print("3. Website is blocking automated requests")
            return []

        # Limit the number of recipes to scrape
        recipe_links = recipe_links[:max_recipes]

        recipes = []
        for i, recipe_url in enumerate(recipe_links, 1):
            print(f"\nScraping recipe {i}/{len(recipe_links)}: {recipe_url}")

            recipe_data = self.scrape_recipe_details(recipe_url)
            if recipe_data and recipe_data['title']:
                recipes.append(recipe_data)
                print(f"✓ Successfully scraped: {recipe_data['title']}")
            else:
                print(f"✗ Failed to scrape recipe: {recipe_url}")

            # Be respectful - add delay between requests
            time.sleep(2)

        return recipes

    def __del__(self):
        """Clean up Selenium driver"""
        if hasattr(self, 'driver'):
            try:
                self.driver.quit()
            except:
                pass


def main():
    # Option 1: Try with Selenium first (handles JavaScript), fallback to requests
    print("=== Gousto Recipe Scraper ===")
    print("Choose scraping method:")
    print("1. Selenium (recommended - handles JavaScript)")
    print("2. Requests only (faster, but may miss content)")

    try:
        choice = input("Enter choice (1 or 2, default=1): ").strip()
        if choice == '2':
            use_selenium = False
            print("Using requests-only method...")
        else:
            use_selenium = True
            print("Attempting to use Selenium...")
    except (EOFError, KeyboardInterrupt):
        # Handle case where input() isn't available (like in some environments)
        use_selenium = True
        print("Defaulting to Selenium method...")

    scraper = GoustoScraper(use_selenium=use_selenium)

    try:
        # URL to scrape
        start_url = "https://www.gousto.co.uk/cookbook/recipes?page=1"

        # Scrape recipes (limit to 5 for testing)
        recipes = scraper.scrape_recipes(start_url, max_recipes=5)

        # Save to JSON file
        if recipes:
            with open('gousto_recipes.json', 'w', encoding='utf-8') as f:
                json.dump(recipes, f, indent=2, ensure_ascii=False)

            print(f"\n🎉 Successfully scraped {len(recipes)} recipes!")
            print("Data saved to 'gousto_recipes.json'")

            # Print sample data with structured ingredients
            if recipes:
                print("\nSample recipe data:")
                sample = recipes[0]
                print(f"Title: {sample['title']}")
                print(f"Image: {sample['image_url']}")
                print(f"Prep Time: {sample['prep_time']}")
                print(f"Serving: {sample['serving']}")
                print(f"Ingredients: {len(sample['ingredients'])} items")

                # Show first few ingredients in structured format
                if sample['ingredients']:
                    print("Sample ingredients:")
                    for i, ing in enumerate(sample['ingredients'][:3]):
                        print(f" {i+1}. {ing['amount']} {ing['unit']} {ing['name']}".strip())

                print(f"Steps: {len(sample['steps'])} steps")
        else:
            print("\n❌ No recipes were successfully scraped.")
            print("This might be due to:")
            print("- Website changes")
            print("- Anti-bot measures")
            print("- Network issues")

    finally:
        # Clean up
        del scraper


if __name__ == "__main__":
    main()
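

# Illustrative note (not part of the original script): each entry written to
# gousto_recipes.json mirrors the recipe_data dict built in scrape_recipe_details(),
# with ingredients structured by parse_ingredient(). A hypothetical entry might
# look roughly like this, assuming a recipe page parses cleanly; the URL slug and
# values below are made-up examples, not real scraped data:
#
# {
#   "url": "https://www.gousto.co.uk/cookbook/recipes/<slug>",
#   "title": "Example Recipe",
#   "image_url": "https://.../example.jpg",
#   "prep_time": "30 min",
#   "serving": "2 people",
#   "ingredients": [{"name": "butter", "amount": "250", "unit": "g"}],
#   "description": "A short description pulled from the page...",
#   "steps": ["Preheat your oven to 200C...", "..."]
# }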