Init commit with working script

2025-08-17 11:44:25 +01:00
commit e570dfe1dc
4 changed files with 807 additions and 0 deletions

.gitignore vendored Normal file (56 additions)

@@ -0,0 +1,56 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual Environment
venv/
ENV/
env/

# IDE
.idea/
.vscode/
*.swp
*.swo
*~

# Project specific
gousto_recipes.json
.DS_Store

# Logs and databases
*.log
*.sqlite3

# Environment variables
.env
.env.local
.env.*.local

# Jupyter Notebook
.ipynb_checkpoints

# Local development settings
local_settings.py

# Selenium
*.crx
*.pem

README.md Normal file (73 additions)

@@ -0,0 +1,73 @@
# Gousto Recipe Scraper

A Python script to scrape recipe data from Gousto's website and save it to a JSON file.

## Prerequisites

- Python 3.7+
- Chrome or Chromium browser (for Selenium)
- ChromeDriver (will be installed automatically by webdriver-manager)

## Setup

1. Clone this repository:

   ```bash
   git clone <repository-url>
   cd gousto-scraper
   ```

2. Create and activate a virtual environment:

   ```bash
   # On Linux/macOS
   python3 -m venv venv
   source venv/bin/activate

   # On Windows
   python -m venv venv
   .\venv\Scripts\activate
   ```

3. Install the required packages:

   ```bash
   pip install -r requirements.txt
   ```
## Usage

Run the scraper with the following command:

```bash
python scraper.py
```

The script first asks which scraping method to use (Selenium or plain requests; see the prompt below) and then:

1. Scrapes recipe data from Gousto's website
2. Saves the results to `gousto_recipes.json`
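
If standard input is not available (for example when running non-interactively), it defaults to Selenium. The prompt text is taken verbatim from `main()` in `scraper.py`:

```text
=== Gousto Recipe Scraper ===
Choose scraping method:
1. Selenium (recommended - handles JavaScript)
2. Requests only (faster, but may miss content)
Enter choice (1 or 2, default=1):
```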
### Options

The options below document the intended command-line interface, but `scraper.py` does not parse any flags yet: the scraping method is chosen via the interactive prompt above, the run is capped at 5 recipes (`max_recipes=5` in `main()`), and the output always goes to `gousto_recipes.json`. A sketch of how the flags could be wired up follows below.

- `--use-selenium` (default: True): Use Selenium for JavaScript rendering
- `--headless` (default: True): Run browser in headless mode
- `--max-pages`: Maximum number of recipe pages to scrape (default: all)
- `--output`: Output JSON file path (default: gousto_recipes.json)

Intended usage, once the flags are implemented:

```bash
python scraper.py --max-pages 5 --output recipes.json
```
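
A minimal sketch of one way to wire these flags to the existing `GoustoScraper` class with `argparse`. This wrapper is not part of the commit: the file name `run_scraper.py`, the `--no-selenium` inversion of `--use-selenium`, and the mapping of `--max-pages` onto `scrape_recipes(max_recipes=...)` are assumptions; `--headless` is omitted because the constructor does not currently expose it.

```python
# run_scraper.py - hypothetical CLI wrapper, not part of this commit
import argparse
import json

from scraper import GoustoScraper  # assumes scraper.py sits next to this file


def parse_args():
    parser = argparse.ArgumentParser(description="Gousto recipe scraper")
    parser.add_argument("--no-selenium", dest="use_selenium", action="store_false",
                        help="Use plain requests instead of Selenium (Selenium is the default)")
    parser.add_argument("--max-pages", type=int, default=5,
                        help="Maximum number of recipes to scrape (mapped to max_recipes)")
    parser.add_argument("--output", default="gousto_recipes.json",
                        help="Output JSON file path")
    return parser.parse_args()


def run():
    args = parse_args()
    scraper = GoustoScraper(use_selenium=args.use_selenium)
    recipes = scraper.scrape_recipes(
        "https://www.gousto.co.uk/cookbook/recipes?page=1",
        max_recipes=args.max_pages,
    )
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(recipes, f, indent=2, ensure_ascii=False)
    print(f"Saved {len(recipes)} recipes to {args.output}")


if __name__ == "__main__":
    run()
```

With this wrapper in place, `python run_scraper.py --max-pages 5 --output recipes.json` would mirror the example above.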
## Output

The script saves the scraped data to a JSON file containing an array of recipe objects (see the sample entry below), each with the fields built in `scrape_recipe_details()`:

- `url`: the recipe page URL
- `title`
- `description`
- `image_url`
- `prep_time` (e.g. "30 min")
- `serving` (e.g. "2 people")
- `ingredients`: a list of objects with `name`, `amount`, and `unit`
- `steps`: the filtered cooking instructions
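
A sample entry from `gousto_recipes.json` (the field names come from `recipe_data` in `scraper.py`; the values are made up for illustration):

```json
{
  "url": "https://www.gousto.co.uk/cookbook/recipes/example-recipe",
  "title": "Example Recipe",
  "image_url": "https://...",
  "prep_time": "30 min",
  "serving": "2 people",
  "ingredients": [
    {"name": "chopped tomatoes", "amount": "400", "unit": "g"},
    {"name": "onion", "amount": "1", "unit": ""}
  ],
  "description": "A short description taken from the recipe page.",
  "steps": [
    "Preheat the oven to 200C.",
    "..."
  ]
}
```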
## Notes
- This script is for educational purposes only
- Be respectful of Gousto's website - don't make too many requests in a short period
- The website structure might change over time, which could break the scraper

requirements.txt Normal file (4 additions)

@@ -0,0 +1,4 @@
selenium==4.15.2
webdriver-manager==4.0.1
beautifulsoup4==4.12.2
lxml==4.9.3
requests==2.31.0

scraper.py Normal file (674 additions)

@@ -0,0 +1,674 @@
import requests
from bs4 import BeautifulSoup
import json
import re
from urllib.parse import urljoin, urlparse
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException


class GoustoScraper:
    def __init__(self, use_selenium=True):
        self.base_url = "https://www.gousto.co.uk"
        self.use_selenium = use_selenium

        if use_selenium:
            # Setup Selenium WebDriver with more robust options
            chrome_options = Options()
            chrome_options.add_argument('--headless=new')  # Use new headless mode
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument('--disable-extensions')
            chrome_options.add_argument('--disable-background-timer-throttling')
            chrome_options.add_argument('--disable-renderer-backgrounding')
            chrome_options.add_argument('--disable-backgrounding-occluded-windows')
            chrome_options.add_argument('--disable-ipc-flooding-protection')
            chrome_options.add_argument('--window-size=1920,1080')
            chrome_options.add_argument('--remote-debugging-port=9222')
            chrome_options.add_argument('--disable-features=TranslateUI')
            chrome_options.add_argument('--disable-default-apps')
            chrome_options.add_argument('--disable-logging')
            chrome_options.add_argument('--disable-plugins')
            chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

            # Set additional options for stability
            chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
            chrome_options.add_experimental_option('useAutomationExtension', False)

            try:
                # Try different methods to initialize Chrome
                print("Attempting to initialize Chrome WebDriver...")

                # Method 1: Try with webdriver-manager (if available)
                try:
                    from webdriver_manager.chrome import ChromeDriverManager
                    from selenium.webdriver.chrome.service import Service
                    service = Service(ChromeDriverManager().install())
                    self.driver = webdriver.Chrome(service=service, options=chrome_options)
                    print("✓ Selenium WebDriver initialized with webdriver-manager")
                except ImportError:
                    print("webdriver-manager not available, trying system Chrome...")
                    # Method 2: Try system Chrome
                    self.driver = webdriver.Chrome(options=chrome_options)
                    print("✓ Selenium WebDriver initialized with system Chrome")
                except Exception as e:
                    print(f"Failed with system Chrome: {e}")
                    # Method 3: Try with explicit Chrome binary path
                    chrome_options.binary_location = "/usr/bin/google-chrome"  # Common Linux path
                    self.driver = webdriver.Chrome(options=chrome_options)
                    print("✓ Selenium WebDriver initialized with explicit binary path")

                self.wait = WebDriverWait(self.driver, 10)

                # Test the driver
                self.driver.get("https://www.google.com")
                print("✓ WebDriver test successful")
            except Exception as e:
                print(f"✗ All Selenium initialization methods failed: {e}")
                print("Common solutions:")
                print("1. Install Chrome browser: sudo apt-get install google-chrome-stable")
                print("2. Install webdriver-manager: pip install webdriver-manager")
                print("3. Download ChromeDriver manually from https://chromedriver.chromium.org/")
                print("4. Make sure Chrome and ChromeDriver versions match")
                print("\nFalling back to requests method...")
                self.use_selenium = False
                self._init_requests_session()
        else:
            self._init_requests_session()

    def _init_requests_session(self):
        """Initialize requests session as fallback"""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    def get_page_content_selenium(self, url):
        """Get page content using Selenium (handles JavaScript)"""
        try:
            print(f"Loading page with Selenium: {url}")
            self.driver.get(url)

            # Wait for recipe cards to load
            try:
                # Wait for recipe elements to be present
                self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "a")))

                # Scroll down to trigger lazy loading
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)

                # Scroll back up
                self.driver.execute_script("window.scrollTo(0, 0);")
                time.sleep(1)
            except TimeoutException:
                print("Timeout waiting for page elements to load")

            return self.driver.page_source
        except Exception as e:
            print(f"Error loading page with Selenium: {e}")
            return None

    def get_page_content_requests(self, url):
        """Get page content using requests (static HTML only)"""
        try:
            response = self.session.get(url)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Error fetching page with requests: {e}")
            return None

    def get_recipe_links(self, page_url):
        """Extract recipe links from the cookbook page"""
        # Get page content
        if self.use_selenium:
            html_content = self.get_page_content_selenium(page_url)
        else:
            html_content = self.get_page_content_requests(page_url)

        if not html_content:
            return []

        soup = BeautifulSoup(html_content, 'html.parser')

        # Debug: Print some of the HTML to see what we're working with
        print("Analyzing page structure...")
        print(f"Page title: {soup.title.string if soup.title else 'No title found'}")

        # More comprehensive selectors for Gousto
        recipe_links = set()  # Use set to avoid duplicates

        # Different selector strategies
        selectors = [
            # Try different link patterns
            'a[href*="/cookbook/recipes/"]:not([href*="?page="])',
            'a[href^="/cookbook/recipes/"]:not([href*="?"])',
            '[data-test-id*="recipe"] a',
            '[data-testid*="recipe"] a',
            '.recipe-card a',
            '.recipe-tile a',
            '.recipe a',
            'article a[href*="/cookbook/recipes/"]',
            'div[class*="recipe"] a',
            # More generic approaches
            'a[href*="/recipes/"]:not([href*="?page="])',
        ]

        for selector in selectors:
            try:
                links = soup.select(selector)
                print(f"Selector '{selector}' found {len(links)} links")

                for link in links:
                    href = link.get('href')
                    if href:
                        # Clean and validate the URL
                        if href.startswith('/'):
                            full_url = urljoin(self.base_url, href)
                        else:
                            full_url = href

                        # Filter out pagination and non-recipe links
                        if ('/cookbook/recipes/' in full_url and
                                '?page=' not in full_url and
                                full_url != page_url):
                            recipe_links.add(full_url)

                if recipe_links:
                    break  # If we found links with this selector, stop trying others
            except Exception as e:
                print(f"Error with selector '{selector}': {e}")
                continue

        recipe_links = list(recipe_links)
        print(f"Total unique recipe links found: {len(recipe_links)}")

        # Debug: Print first few links
        for i, link in enumerate(recipe_links[:3]):
            print(f"Sample link {i+1}: {link}")

        return recipe_links

    def filter_steps(self, raw_steps, recipe_title=''):
        """Filter out unwanted items from steps array (breadcrumbs, navigation, etc.)"""
        if not raw_steps:
            return []

        filtered_steps = []

        # Common unwanted step content to filter out
        unwanted_patterns = [
            'cookbook',
            'recipes',
            'home',
            'menu',
            'navigation',
            'skip to',
            'back to',
            'print recipe',
            'save recipe',
            'rate this recipe',
            'share',
            'pinterest',
            'facebook',
            'twitter',
            'instagram'
        ]

        # Convert recipe title to lowercase for comparison
        recipe_title_lower = recipe_title.lower() if recipe_title else ''

        for i, step in enumerate(raw_steps):
            step_lower = step.lower().strip()

            # Skip empty steps
            if not step_lower:
                continue

            # Skip if it's exactly "Cookbook"
            if step_lower == 'cookbook':
                continue

            # Skip if it matches the recipe title exactly
            if recipe_title_lower and step_lower == recipe_title_lower:
                continue

            # Skip if it's very short and likely navigation (less than 10 chars)
            if len(step) < 10 and any(pattern in step_lower for pattern in unwanted_patterns):
                continue

            # Skip if it contains unwanted patterns and is short (likely navigation)
            if len(step) < 50 and any(pattern in step_lower for pattern in unwanted_patterns):
                continue

            # Skip steps that are just numbers (step numbers without content)
            if step.strip().isdigit():
                continue

            # Skip if it's just a single word that's likely navigation
            if len(step.split()) == 1 and len(step) < 15:
                # But keep single words that are likely cooking instructions
                cooking_words = ['preheat', 'heat', 'boil', 'simmer', 'bake', 'fry', 'mix', 'stir', 'serve']
                if not any(word in step_lower for word in cooking_words):
                    continue

            # Skip if it looks like a breadcrumb pattern (word > word > word)
            if '>' in step or '»' in step or ('/' in step and len(step.split()) <= 4):
                continue

            # If we've made it this far, it's likely a real cooking step
            filtered_steps.append(step.strip())

        # Additional filtering: remove the first few items if they still look like breadcrumbs
        # This handles cases where the first 1-2 items are navigation that slipped through
        if len(filtered_steps) >= 2:
            # Check if first item is very short and generic
            first_item = filtered_steps[0].lower()
            if (len(first_item) < 20 and
                    any(pattern in first_item for pattern in ['cookbook', 'recipe', 'home', 'menu'])):
                filtered_steps = filtered_steps[1:]

            # Check if second item is the recipe title or similar
            if (len(filtered_steps) >= 2 and recipe_title_lower and
                    filtered_steps[0].lower() == recipe_title_lower):
                filtered_steps = filtered_steps[1:]

        return filtered_steps

    def parse_ingredient(self, ingredient_text):
        """Parse ingredient text into structured format with name, amount, and unit"""
        ingredient_text = ingredient_text.strip()

        # Common units and their variations
        units = {
            # Volume
            'ml', 'millilitre', 'milliliters', 'millilitres',
            'l', 'litre', 'liter', 'litres', 'liters',
            'cup', 'cups', 'c',
            'tbsp', 'tablespoon', 'tablespoons', 'tbs', 'tb',
            'tsp', 'teaspoon', 'teaspoons', 'ts',
            'fl oz', 'fluid ounce', 'fluid ounces',
            'pint', 'pints', 'pt',
            # Weight
            'g', 'gram', 'grams', 'gr',
            'kg', 'kilogram', 'kilograms',
            'oz', 'ounce', 'ounces',
            'lb', 'pound', 'pounds', 'lbs',
            # Count/pieces
            'piece', 'pieces', 'pc', 'pcs',
            'slice', 'slices',
            'clove', 'cloves',
            'bunch', 'bunches',
            'handful', 'handfuls',
            'pinch', 'pinches',
            'dash', 'dashes',
            'sprig', 'sprigs',
            'stick', 'sticks',
            'can', 'cans', 'tin', 'tins',
            'jar', 'jars',
            'packet', 'packets', 'pack', 'packs',
            'box', 'boxes',
            'bag', 'bags',
            # Special measurements
            'to taste', 'as needed', 'optional'
        }

        # Create a pattern to match numbers (including fractions and decimals)
        number_pattern = r'\d+(?:[/.]\d+)?(?:\.\d+)?|\d*\.\d+'

        # Try to extract amount and unit
        ingredient = {
            'name': ingredient_text,
            'amount': '',
            'unit': ''
        }

        # Pattern 1: "2 cups flour" or "250g butter"
        pattern1 = r'^(' + number_pattern + r')\s*([a-zA-Z\s]+?)\s+(.+)$'
        match1 = re.match(pattern1, ingredient_text, re.IGNORECASE)
        if match1:
            amount = match1.group(1)
            potential_unit = match1.group(2).strip().lower()
            name = match1.group(3).strip()

            # Check if the potential unit is in our units list
            if any(unit in potential_unit for unit in units):
                ingredient['amount'] = amount
                ingredient['unit'] = match1.group(2).strip()  # Keep original case
                ingredient['name'] = name
                return ingredient

        # Pattern 2: "flour - 2 cups" or "butter (250g)"
        pattern2 = r'^(.+?)[\s\-\(]+(' + number_pattern + r')\s*([a-zA-Z\s]*?)[\)\s]*$'
        match2 = re.match(pattern2, ingredient_text, re.IGNORECASE)
        if match2:
            name = match2.group(1).strip()
            amount = match2.group(2)
            potential_unit = match2.group(3).strip().lower()

            if any(unit in potential_unit for unit in units) or not potential_unit:
                ingredient['name'] = name
                ingredient['amount'] = amount
                ingredient['unit'] = match2.group(3).strip()  # Keep original case
                return ingredient

        # Pattern 3: Just amount at start "2 onions" or "1 large egg"
        pattern3 = r'^(' + number_pattern + r')\s+(.+)$'
        match3 = re.match(pattern3, ingredient_text, re.IGNORECASE)
        if match3:
            amount = match3.group(1)
            rest = match3.group(2).strip()

            # Check if it starts with a unit
            words = rest.split()
            if words and words[0].lower() in units:
                ingredient['amount'] = amount
                ingredient['unit'] = words[0]
                ingredient['name'] = ' '.join(words[1:]) if len(words) > 1 else rest
                return ingredient
            else:
                # No explicit unit, treat as pieces/items
                ingredient['amount'] = amount
                ingredient['unit'] = ''
                ingredient['name'] = rest
                return ingredient

        # Pattern 4: Fractional amounts "1/2 cup sugar"
        fraction_pattern = r'^(\d+/\d+)\s+([a-zA-Z\s]+?)\s+(.+)$'
        match4 = re.match(fraction_pattern, ingredient_text, re.IGNORECASE)
        if match4:
            amount = match4.group(1)
            potential_unit = match4.group(2).strip().lower()
            name = match4.group(3).strip()

            if any(unit in potential_unit for unit in units):
                ingredient['amount'] = amount
                ingredient['unit'] = match4.group(2).strip()
                ingredient['name'] = name
                return ingredient

        # If no patterns match, return as-is (some ingredients might not have amounts)
        return ingredient

    def scrape_recipe_details(self, recipe_url):
        """Scrape detailed recipe information from individual recipe page"""
        try:
            if self.use_selenium:
                html_content = self.get_page_content_selenium(recipe_url)
            else:
                response = self.session.get(recipe_url)
                response.raise_for_status()
                html_content = response.text

            if not html_content:
                return None

            soup = BeautifulSoup(html_content, 'html.parser')

            recipe_data = {
                'url': recipe_url,
                'title': '',
                'image_url': '',
                'prep_time': '',
                'serving': '',
                'ingredients': [],
                'description': '',
                'steps': []
            }

            # Extract title with more selectors
            title_selectors = [
                'h1',
                '[data-test-id*="title"]',
                '[data-testid*="title"]',
                '.recipe-title',
                '.title',
                'title'
            ]
            for selector in title_selectors:
                title_elem = soup.select_one(selector)
                if title_elem and title_elem.get_text(strip=True):
                    recipe_data['title'] = title_elem.get_text(strip=True)
                    break

            # Extract image URL with more selectors
            img_selectors = [
                'img[src*="recipe"]',
                'img[alt*="recipe"]',
                '.recipe-image img',
                '.hero-image img',
                'img[src*="gousto"]',
                'main img',
                'img[src*="cloudinary"]'  # Common CDN for food images
            ]
            for selector in img_selectors:
                img_elem = soup.select_one(selector)
                if img_elem:
                    src = img_elem.get('src') or img_elem.get('data-src') or img_elem.get('data-lazy')
                    if src and ('recipe' in src.lower() or 'gousto' in src.lower()):
                        if src.startswith('//'):
                            src = 'https:' + src
                        elif src.startswith('/'):
                            src = urljoin(self.base_url, src)
                        recipe_data['image_url'] = src
                        break

            # Extract prep time
            time_patterns = [
                r'(\d+)\s*min',
                r'(\d+)\s*hour',
                r'(\d+)\s*hr'
            ]
            time_text = soup.get_text()
            for pattern in time_patterns:
                match = re.search(pattern, time_text, re.IGNORECASE)
                if match:
                    recipe_data['prep_time'] = match.group(0)
                    break

            # Extract serving information
            serving_pattern = r'(\d+)\s*(?:serve|serving|portion|people)'
            serving_match = re.search(serving_pattern, soup.get_text(), re.IGNORECASE)
            if serving_match:
                recipe_data['serving'] = serving_match.group(0)

            # Extract ingredients and parse them into structured format
            ingredient_selectors = [
                '.ingredient',
                '.ingredients li',
                '[data-test-id*="ingredient"]',
                'ul[class*="ingredient"] li',
                '.recipe-ingredients li'
            ]
            for selector in ingredient_selectors:
                ingredients = soup.select(selector)
                if ingredients:
                    raw_ingredients = [ing.get_text(strip=True) for ing in ingredients if ing.get_text(strip=True)]
                    # Parse each ingredient into structured format
                    recipe_data['ingredients'] = [self.parse_ingredient(ing) for ing in raw_ingredients]
                    break

            # Extract description
            desc_selectors = [
                '.description',
                '.recipe-description',
                'meta[name="description"]',
                '.intro',
                'p'
            ]
            for selector in desc_selectors:
                if selector == 'meta[name="description"]':
                    desc_elem = soup.select_one(selector)
                    if desc_elem:
                        recipe_data['description'] = desc_elem.get('content', '').strip()
                        if recipe_data['description']:
                            break
                else:
                    desc_elem = soup.select_one(selector)
                    if desc_elem and len(desc_elem.get_text(strip=True)) > 20:
                        recipe_data['description'] = desc_elem.get_text(strip=True)
                        break

            # Extract steps
            step_selectors = [
                '.step',
                '.instruction',
                '.method li',
                '.directions li',
                'ol li',
                '[data-test-id*="step"]'
            ]
            for selector in step_selectors:
                steps = soup.select(selector)
                if steps:
                    raw_steps = [step.get_text(strip=True) for step in steps if step.get_text(strip=True)]
                    # Filter out unwanted steps (breadcrumbs, navigation, etc.)
                    recipe_data['steps'] = self.filter_steps(raw_steps, recipe_data.get('title', ''))
                    break

            return recipe_data
        except Exception as e:
            print(f"Error scraping recipe {recipe_url}: {e}")
            return None

    def scrape_recipes(self, start_url, max_recipes=10):
        """Main method to scrape recipes"""
        print(f"Starting to scrape recipes from: {start_url}")
        print(f"Using {'Selenium' if self.use_selenium else 'Requests'} for web scraping")

        # Get recipe links from the main page
        recipe_links = self.get_recipe_links(start_url)

        if not recipe_links:
            print("No recipe links found. Possible reasons:")
            print("1. Website structure has changed")
            print("2. Content is loaded dynamically and requires JavaScript")
            print("3. Website is blocking automated requests")
            return []

        # Limit the number of recipes to scrape
        recipe_links = recipe_links[:max_recipes]

        recipes = []
        for i, recipe_url in enumerate(recipe_links, 1):
            print(f"\nScraping recipe {i}/{len(recipe_links)}: {recipe_url}")

            recipe_data = self.scrape_recipe_details(recipe_url)
            if recipe_data and recipe_data['title']:
                recipes.append(recipe_data)
                print(f"✓ Successfully scraped: {recipe_data['title']}")
            else:
                print(f"✗ Failed to scrape recipe: {recipe_url}")

            # Be respectful - add delay between requests
            time.sleep(2)

        return recipes

    def __del__(self):
        """Clean up Selenium driver"""
        if hasattr(self, 'driver'):
            try:
                self.driver.quit()
            except:
                pass


def main():
    # Option 1: Try with Selenium first (handles JavaScript), fallback to requests
    print("=== Gousto Recipe Scraper ===")
    print("Choose scraping method:")
    print("1. Selenium (recommended - handles JavaScript)")
    print("2. Requests only (faster, but may miss content)")

    try:
        choice = input("Enter choice (1 or 2, default=1): ").strip()
        if choice == '2':
            use_selenium = False
            print("Using requests-only method...")
        else:
            use_selenium = True
            print("Attempting to use Selenium...")
    except (EOFError, KeyboardInterrupt):
        # Handle case where input() isn't available (like in some environments)
        use_selenium = True
        print("Defaulting to Selenium method...")

    scraper = GoustoScraper(use_selenium=use_selenium)

    try:
        # URL to scrape
        start_url = "https://www.gousto.co.uk/cookbook/recipes?page=1"

        # Scrape recipes (limit to 5 for testing)
        recipes = scraper.scrape_recipes(start_url, max_recipes=5)

        # Save to JSON file
        if recipes:
            with open('gousto_recipes.json', 'w', encoding='utf-8') as f:
                json.dump(recipes, f, indent=2, ensure_ascii=False)

            print(f"\n🎉 Successfully scraped {len(recipes)} recipes!")
            print("Data saved to 'gousto_recipes.json'")

            # Print sample data with structured ingredients
            if recipes:
                print("\nSample recipe data:")
                sample = recipes[0]
                print(f"Title: {sample['title']}")
                print(f"Image: {sample['image_url']}")
                print(f"Prep Time: {sample['prep_time']}")
                print(f"Serving: {sample['serving']}")
                print(f"Ingredients: {len(sample['ingredients'])} items")

                # Show first few ingredients in structured format
                if sample['ingredients']:
                    print("Sample ingredients:")
                    for i, ing in enumerate(sample['ingredients'][:3]):
                        print(f" {i+1}. {ing['amount']} {ing['unit']} {ing['name']}".strip())

                print(f"Steps: {len(sample['steps'])} steps")
        else:
            print("\n❌ No recipes were successfully scraped.")
            print("This might be due to:")
            print("- Website changes")
            print("- Anti-bot measures")
            print("- Network issues")
    finally:
        # Clean up
        del scraper


if __name__ == "__main__":
    main()