Initial commit with working script
.gitignore · vendored · Normal file · 56 lines
@@ -0,0 +1,56 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual Environment
venv/
ENV/
env/

# IDE
.idea/
.vscode/
*.swp
*.swo
*~

# Project specific
gousto_recipes.json
.DS_Store

# Logs and databases
*.log
*.sqlite3

# Environment variables
.env
.env.local
.env.*.local

# Jupyter Notebook
.ipynb_checkpoints

# Local development settings
local_settings.py

# Selenium
*.crx
*.pem
README.md · Normal file · 73 lines
@@ -0,0 +1,73 @@
# Gousto Recipe Scraper

A Python script to scrape recipe data from Gousto's website and save it to a JSON file.

## Prerequisites

- Python 3.7+
- Chrome or Chromium browser (for Selenium)
- ChromeDriver (will be installed automatically by webdriver-manager)

## Setup

1. Clone this repository:
```bash
git clone <repository-url>
cd gousto-scraper
```

2. Create and activate a virtual environment:
```bash
# On Linux/macOS
python3 -m venv venv
source venv/bin/activate

# On Windows
python -m venv venv
.\venv\Scripts\activate
```

3. Install the required packages:
```bash
pip install -r requirements.txt
```

## Usage

Run the scraper with the following command:

```bash
python scraper.py
```

This will:
1. Scrape recipe data from Gousto's website
2. Save the results to `gousto_recipes.json`

### Options

The current version takes no command-line flags; it is configured interactively and in code:

- On startup it asks `Enter choice (1 or 2, default=1)`: option `1` (the default) uses Selenium, which renders JavaScript before parsing; option `2` uses requests only, which is faster but may miss dynamically loaded content.
- If Selenium cannot be initialized, the scraper automatically falls back to the requests method.
- The number of recipes to scrape (5) and the output file (`gousto_recipes.json`) are set in `main()` in `scraper.py`; edit those values there to change them.

## Output

The script saves the scraped data to a JSON file containing an array of recipe objects. Each object includes:

- `title`
- `description`
- `ingredients` (each entry has `name`, `amount`, and `unit` fields)
- `steps`
- `prep_time`
- `serving`
- `image_url`
- `url`

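A minimal sketch for inspecting the result (it assumes `gousto_recipes.json` has already been written to the current directory and contains at least one recipe):

```python
import json

# Load the scraper's output
with open("gousto_recipes.json", encoding="utf-8") as f:
    recipes = json.load(f)

print(f"{len(recipes)} recipes scraped")

first = recipes[0]  # assumes at least one recipe was scraped
print(f"{first['title']} ({first['prep_time']})")
for ingredient in first["ingredients"][:3]:
    # Each ingredient is a dict with 'name', 'amount' and 'unit' keys
    print(f"  {ingredient['amount']} {ingredient['unit']} {ingredient['name']}".strip())
```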

## Notes

- This script is for educational purposes only
- Be respectful of Gousto's website: don't make too many requests in a short period
- The website structure might change over time, which could break the scraper
requirements.txt · Normal file · 5 lines
@@ -0,0 +1,5 @@
requests==2.31.0  # imported directly by scraper.py and used for the non-Selenium fallback
selenium==4.15.2
webdriver-manager==4.0.1
beautifulsoup4==4.12.2
lxml==4.9.3
scraper.py · Normal file · 674 lines
@@ -0,0 +1,674 @@
import requests
from bs4 import BeautifulSoup
import json
import re
from urllib.parse import urljoin, urlparse
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

class GoustoScraper:
    def __init__(self, use_selenium=True):
        self.base_url = "https://www.gousto.co.uk"
        self.use_selenium = use_selenium

        if use_selenium:
            # Setup Selenium WebDriver with more robust options
            chrome_options = Options()
            chrome_options.add_argument('--headless=new')  # Use new headless mode
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument('--disable-extensions')
            chrome_options.add_argument('--disable-background-timer-throttling')
            chrome_options.add_argument('--disable-renderer-backgrounding')
            chrome_options.add_argument('--disable-backgrounding-occluded-windows')
            chrome_options.add_argument('--disable-ipc-flooding-protection')
            chrome_options.add_argument('--window-size=1920,1080')
            chrome_options.add_argument('--remote-debugging-port=9222')
            chrome_options.add_argument('--disable-features=TranslateUI')
            chrome_options.add_argument('--disable-default-apps')
            chrome_options.add_argument('--disable-logging')
            chrome_options.add_argument('--disable-plugins')
            chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

            # Set additional options for stability
            chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
            chrome_options.add_experimental_option('useAutomationExtension', False)

            try:
                # Try different methods to initialize Chrome
                print("Attempting to initialize Chrome WebDriver...")

                # Method 1: Try with webdriver-manager (if available)
                try:
                    from webdriver_manager.chrome import ChromeDriverManager
                    from selenium.webdriver.chrome.service import Service

                    service = Service(ChromeDriverManager().install())
                    self.driver = webdriver.Chrome(service=service, options=chrome_options)
                    print("✓ Selenium WebDriver initialized with webdriver-manager")

                except ImportError:
                    print("webdriver-manager not available, trying system Chrome...")
                    # Method 2: Try system Chrome
                    self.driver = webdriver.Chrome(options=chrome_options)
                    print("✓ Selenium WebDriver initialized with system Chrome")

                except Exception as e:
                    print(f"Failed with system Chrome: {e}")
                    # Method 3: Try with explicit Chrome binary path
                    chrome_options.binary_location = "/usr/bin/google-chrome"  # Common Linux path
                    self.driver = webdriver.Chrome(options=chrome_options)
                    print("✓ Selenium WebDriver initialized with explicit binary path")

                self.wait = WebDriverWait(self.driver, 10)

                # Test the driver
                self.driver.get("https://www.google.com")
                print("✓ WebDriver test successful")

            except Exception as e:
                print(f"✗ All Selenium initialization methods failed: {e}")
                print("Common solutions:")
                print("1. Install Chrome browser: sudo apt-get install google-chrome-stable")
                print("2. Install webdriver-manager: pip install webdriver-manager")
                print("3. Download ChromeDriver manually from https://chromedriver.chromium.org/")
                print("4. Make sure Chrome and ChromeDriver versions match")
                print("\nFalling back to requests method...")
                self.use_selenium = False
                self._init_requests_session()
        else:
            self._init_requests_session()

    def _init_requests_session(self):
        """Initialize requests session as fallback"""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    def get_page_content_selenium(self, url):
        """Get page content using Selenium (handles JavaScript)"""
        try:
            print(f"Loading page with Selenium: {url}")
            self.driver.get(url)

            # Wait for recipe cards to load
            try:
                # Wait for recipe elements to be present
                self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "a")))

                # Scroll down to trigger lazy loading
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)

                # Scroll back up
                self.driver.execute_script("window.scrollTo(0, 0);")
                time.sleep(1)

            except TimeoutException:
                print("Timeout waiting for page elements to load")

            return self.driver.page_source

        except Exception as e:
            print(f"Error loading page with Selenium: {e}")
            return None

    def get_page_content_requests(self, url):
        """Get page content using requests (static HTML only)"""
        try:
            response = self.session.get(url)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Error fetching page with requests: {e}")
            return None

    def get_recipe_links(self, page_url):
        """Extract recipe links from the cookbook page"""
        # Get page content
        if self.use_selenium:
            html_content = self.get_page_content_selenium(page_url)
        else:
            html_content = self.get_page_content_requests(page_url)

        if not html_content:
            return []

        soup = BeautifulSoup(html_content, 'html.parser')

        # Debug: Print some of the HTML to see what we're working with
        print("Analyzing page structure...")
        print(f"Page title: {soup.title.string if soup.title else 'No title found'}")

        # More comprehensive selectors for Gousto
        recipe_links = set()  # Use set to avoid duplicates

        # Different selector strategies
        selectors = [
            # Try different link patterns
            'a[href*="/cookbook/recipes/"]:not([href*="?page="])',
            'a[href^="/cookbook/recipes/"]:not([href*="?"])',
            '[data-test-id*="recipe"] a',
            '[data-testid*="recipe"] a',
            '.recipe-card a',
            '.recipe-tile a',
            '.recipe a',
            'article a[href*="/cookbook/recipes/"]',
            'div[class*="recipe"] a',
            # More generic approaches
            'a[href*="/recipes/"]:not([href*="?page="])',
        ]

        for selector in selectors:
            try:
                links = soup.select(selector)
                print(f"Selector '{selector}' found {len(links)} links")

                for link in links:
                    href = link.get('href')
                    if href:
                        # Clean and validate the URL
                        if href.startswith('/'):
                            full_url = urljoin(self.base_url, href)
                        else:
                            full_url = href

                        # Filter out pagination and non-recipe links
                        if ('/cookbook/recipes/' in full_url and
                                '?page=' not in full_url and
                                full_url != page_url):
                            recipe_links.add(full_url)

                if recipe_links:
                    break  # If we found links with this selector, stop trying others

            except Exception as e:
                print(f"Error with selector '{selector}': {e}")
                continue

        recipe_links = list(recipe_links)
        print(f"Total unique recipe links found: {len(recipe_links)}")

        # Debug: Print first few links
        for i, link in enumerate(recipe_links[:3]):
            print(f"Sample link {i+1}: {link}")

        return recipe_links

    def filter_steps(self, raw_steps, recipe_title=''):
        """Filter out unwanted items from steps array (breadcrumbs, navigation, etc.)"""
        if not raw_steps:
            return []

        filtered_steps = []

        # Common unwanted step content to filter out
        unwanted_patterns = [
            'cookbook',
            'recipes',
            'home',
            'menu',
            'navigation',
            'skip to',
            'back to',
            'print recipe',
            'save recipe',
            'rate this recipe',
            'share',
            'pinterest',
            'facebook',
            'twitter',
            'instagram'
        ]

        # Convert recipe title to lowercase for comparison
        recipe_title_lower = recipe_title.lower() if recipe_title else ''

        for i, step in enumerate(raw_steps):
            step_lower = step.lower().strip()

            # Skip empty steps
            if not step_lower:
                continue

            # Skip if it's exactly "Cookbook"
            if step_lower == 'cookbook':
                continue

            # Skip if it matches the recipe title exactly
            if recipe_title_lower and step_lower == recipe_title_lower:
                continue

            # Skip if it's very short and likely navigation (less than 10 chars)
            if len(step) < 10 and any(pattern in step_lower for pattern in unwanted_patterns):
                continue

            # Skip if it contains unwanted patterns and is short (likely navigation)
            if len(step) < 50 and any(pattern in step_lower for pattern in unwanted_patterns):
                continue

            # Skip steps that are just numbers (step numbers without content)
            if step.strip().isdigit():
                continue

            # Skip if it's just a single word that's likely navigation
            if len(step.split()) == 1 and len(step) < 15:
                # But keep single words that are likely cooking instructions
                cooking_words = ['preheat', 'heat', 'boil', 'simmer', 'bake', 'fry', 'mix', 'stir', 'serve']
                if not any(word in step_lower for word in cooking_words):
                    continue

            # Skip if it looks like a breadcrumb pattern (word > word > word)
            if '>' in step or '»' in step or ('/' in step and len(step.split()) <= 4):
                continue

            # If we've made it this far, it's likely a real cooking step
            filtered_steps.append(step.strip())

        # Additional filtering: remove the first few items if they still look like breadcrumbs
        # This handles cases where the first 1-2 items are navigation that slipped through
        if len(filtered_steps) >= 2:
            # Check if first item is very short and generic
            first_item = filtered_steps[0].lower()
            if (len(first_item) < 20 and
                    any(pattern in first_item for pattern in ['cookbook', 'recipe', 'home', 'menu'])):
                filtered_steps = filtered_steps[1:]

            # Check if second item is the recipe title or similar
            if (len(filtered_steps) >= 2 and recipe_title_lower and
                    filtered_steps[0].lower() == recipe_title_lower):
                filtered_steps = filtered_steps[1:]

        return filtered_steps

    def parse_ingredient(self, ingredient_text):
        """Parse ingredient text into structured format with name, amount, and unit"""
        ingredient_text = ingredient_text.strip()

        # Common units and their variations
        units = {
            # Volume
            'ml', 'millilitre', 'milliliters', 'millilitres',
            'l', 'litre', 'liter', 'litres', 'liters',
            'cup', 'cups', 'c',
            'tbsp', 'tablespoon', 'tablespoons', 'tbs', 'tb',
            'tsp', 'teaspoon', 'teaspoons', 'ts',
            'fl oz', 'fluid ounce', 'fluid ounces',
            'pint', 'pints', 'pt',

            # Weight
            'g', 'gram', 'grams', 'gr',
            'kg', 'kilogram', 'kilograms',
            'oz', 'ounce', 'ounces',
            'lb', 'pound', 'pounds', 'lbs',

            # Count/pieces
            'piece', 'pieces', 'pc', 'pcs',
            'slice', 'slices',
            'clove', 'cloves',
            'bunch', 'bunches',
            'handful', 'handfuls',
            'pinch', 'pinches',
            'dash', 'dashes',
            'sprig', 'sprigs',
            'stick', 'sticks',
            'can', 'cans', 'tin', 'tins',
            'jar', 'jars',
            'packet', 'packets', 'pack', 'packs',
            'box', 'boxes',
            'bag', 'bags',

            # Special measurements
            'to taste', 'as needed', 'optional'
        }

        # Create a pattern to match numbers (including fractions and decimals)
        number_pattern = r'\d+(?:[/.]\d+)?(?:\.\d+)?|\d*\.\d+'

        # Try to extract amount and unit
        ingredient = {
            'name': ingredient_text,
            'amount': '',
            'unit': ''
        }

        # Pattern 1: "2 cups flour" or "250g butter"
        pattern1 = r'^(' + number_pattern + r')\s*([a-zA-Z\s]+?)\s+(.+)$'
        match1 = re.match(pattern1, ingredient_text, re.IGNORECASE)

        if match1:
            amount = match1.group(1)
            potential_unit = match1.group(2).strip().lower()
            name = match1.group(3).strip()

            # Check if the potential unit is in our units list
            if any(unit in potential_unit for unit in units):
                ingredient['amount'] = amount
                ingredient['unit'] = match1.group(2).strip()  # Keep original case
                ingredient['name'] = name
                return ingredient

        # Pattern 2: "flour - 2 cups" or "butter (250g)"
        pattern2 = r'^(.+?)[\s\-\(]+(' + number_pattern + r')\s*([a-zA-Z\s]*?)[\)\s]*$'
        match2 = re.match(pattern2, ingredient_text, re.IGNORECASE)

        if match2:
            name = match2.group(1).strip()
            amount = match2.group(2)
            potential_unit = match2.group(3).strip().lower()

            if any(unit in potential_unit for unit in units) or not potential_unit:
                ingredient['name'] = name
                ingredient['amount'] = amount
                ingredient['unit'] = match2.group(3).strip()  # Keep original case
                return ingredient

        # Pattern 3: Just amount at start "2 onions" or "1 large egg"
        pattern3 = r'^(' + number_pattern + r')\s+(.+)$'
        match3 = re.match(pattern3, ingredient_text, re.IGNORECASE)

        if match3:
            amount = match3.group(1)
            rest = match3.group(2).strip()

            # Check if it starts with a unit
            words = rest.split()
            if words and words[0].lower() in units:
                ingredient['amount'] = amount
                ingredient['unit'] = words[0]
                ingredient['name'] = ' '.join(words[1:]) if len(words) > 1 else rest
                return ingredient
            else:
                # No explicit unit, treat as pieces/items
                ingredient['amount'] = amount
                ingredient['unit'] = ''
                ingredient['name'] = rest
                return ingredient

        # Pattern 4: Fractional amounts "1/2 cup sugar"
        fraction_pattern = r'^(\d+/\d+)\s+([a-zA-Z\s]+?)\s+(.+)$'
        match4 = re.match(fraction_pattern, ingredient_text, re.IGNORECASE)

        if match4:
            amount = match4.group(1)
            potential_unit = match4.group(2).strip().lower()
            name = match4.group(3).strip()

            if any(unit in potential_unit for unit in units):
                ingredient['amount'] = amount
                ingredient['unit'] = match4.group(2).strip()
                ingredient['name'] = name
                return ingredient

        # If no patterns match, return as-is (some ingredients might not have amounts)
        return ingredient

    def scrape_recipe_details(self, recipe_url):
        """Scrape detailed recipe information from individual recipe page"""
        try:
            if self.use_selenium:
                html_content = self.get_page_content_selenium(recipe_url)
            else:
                response = self.session.get(recipe_url)
                response.raise_for_status()
                html_content = response.text

            if not html_content:
                return None

            soup = BeautifulSoup(html_content, 'html.parser')

            recipe_data = {
                'url': recipe_url,
                'title': '',
                'image_url': '',
                'prep_time': '',
                'serving': '',
                'ingredients': [],
                'description': '',
                'steps': []
            }

            # Extract title with more selectors
            title_selectors = [
                'h1',
                '[data-test-id*="title"]',
                '[data-testid*="title"]',
                '.recipe-title',
                '.title',
                'title'
            ]
            for selector in title_selectors:
                title_elem = soup.select_one(selector)
                if title_elem and title_elem.get_text(strip=True):
                    recipe_data['title'] = title_elem.get_text(strip=True)
                    break

            # Extract image URL with more selectors
            img_selectors = [
                'img[src*="recipe"]',
                'img[alt*="recipe"]',
                '.recipe-image img',
                '.hero-image img',
                'img[src*="gousto"]',
                'main img',
                'img[src*="cloudinary"]'  # Common CDN for food images
            ]
            for selector in img_selectors:
                img_elem = soup.select_one(selector)
                if img_elem:
                    src = img_elem.get('src') or img_elem.get('data-src') or img_elem.get('data-lazy')
                    if src and ('recipe' in src.lower() or 'gousto' in src.lower()):
                        if src.startswith('//'):
                            src = 'https:' + src
                        elif src.startswith('/'):
                            src = urljoin(self.base_url, src)
                        recipe_data['image_url'] = src
                        break

            # Extract prep time
            time_patterns = [
                r'(\d+)\s*min',
                r'(\d+)\s*hour',
                r'(\d+)\s*hr'
            ]

            time_text = soup.get_text()
            for pattern in time_patterns:
                match = re.search(pattern, time_text, re.IGNORECASE)
                if match:
                    recipe_data['prep_time'] = match.group(0)
                    break

            # Extract serving information
            serving_pattern = r'(\d+)\s*(?:serve|serving|portion|people)'
            serving_match = re.search(serving_pattern, soup.get_text(), re.IGNORECASE)
            if serving_match:
                recipe_data['serving'] = serving_match.group(0)

            # Extract ingredients and parse them into structured format
            ingredient_selectors = [
                '.ingredient',
                '.ingredients li',
                '[data-test-id*="ingredient"]',
                'ul[class*="ingredient"] li',
                '.recipe-ingredients li'
            ]

            for selector in ingredient_selectors:
                ingredients = soup.select(selector)
                if ingredients:
                    raw_ingredients = [ing.get_text(strip=True) for ing in ingredients if ing.get_text(strip=True)]
                    # Parse each ingredient into structured format
                    recipe_data['ingredients'] = [self.parse_ingredient(ing) for ing in raw_ingredients]
                    break

            # Extract description
            desc_selectors = [
                '.description',
                '.recipe-description',
                'meta[name="description"]',
                '.intro',
                'p'
            ]

            for selector in desc_selectors:
                if selector == 'meta[name="description"]':
                    desc_elem = soup.select_one(selector)
                    if desc_elem:
                        recipe_data['description'] = desc_elem.get('content', '').strip()
                        if recipe_data['description']:
                            break
                else:
                    desc_elem = soup.select_one(selector)
                    if desc_elem and len(desc_elem.get_text(strip=True)) > 20:
                        recipe_data['description'] = desc_elem.get_text(strip=True)
                        break

            # Extract steps
            step_selectors = [
                '.step',
                '.instruction',
                '.method li',
                '.directions li',
                'ol li',
                '[data-test-id*="step"]'
            ]

            for selector in step_selectors:
                steps = soup.select(selector)
                if steps:
                    raw_steps = [step.get_text(strip=True) for step in steps if step.get_text(strip=True)]
                    # Filter out unwanted steps (breadcrumbs, navigation, etc.)
                    recipe_data['steps'] = self.filter_steps(raw_steps, recipe_data.get('title', ''))
                    break

            return recipe_data

        except Exception as e:
            print(f"Error scraping recipe {recipe_url}: {e}")
            return None

    def scrape_recipes(self, start_url, max_recipes=10):
        """Main method to scrape recipes"""
        print(f"Starting to scrape recipes from: {start_url}")
        print(f"Using {'Selenium' if self.use_selenium else 'Requests'} for web scraping")

        # Get recipe links from the main page
        recipe_links = self.get_recipe_links(start_url)

        if not recipe_links:
            print("No recipe links found. Possible reasons:")
            print("1. Website structure has changed")
            print("2. Content is loaded dynamically and requires JavaScript")
            print("3. Website is blocking automated requests")
            return []

        # Limit the number of recipes to scrape
        recipe_links = recipe_links[:max_recipes]

        recipes = []

        for i, recipe_url in enumerate(recipe_links, 1):
            print(f"\nScraping recipe {i}/{len(recipe_links)}: {recipe_url}")

            recipe_data = self.scrape_recipe_details(recipe_url)

            if recipe_data and recipe_data['title']:
                recipes.append(recipe_data)
                print(f"✓ Successfully scraped: {recipe_data['title']}")
            else:
                print(f"✗ Failed to scrape recipe: {recipe_url}")

            # Be respectful - add delay between requests
            time.sleep(2)

        return recipes

    def __del__(self):
        """Clean up Selenium driver"""
        if hasattr(self, 'driver'):
            try:
                self.driver.quit()
            except:
                pass

def main():
    # Option 1: Try with Selenium first (handles JavaScript), fallback to requests
    print("=== Gousto Recipe Scraper ===")
    print("Choose scraping method:")
    print("1. Selenium (recommended - handles JavaScript)")
    print("2. Requests only (faster, but may miss content)")

    try:
        choice = input("Enter choice (1 or 2, default=1): ").strip()
        if choice == '2':
            use_selenium = False
            print("Using requests-only method...")
        else:
            use_selenium = True
            print("Attempting to use Selenium...")
    except (EOFError, KeyboardInterrupt):
        # Handle case where input() isn't available (like in some environments)
        use_selenium = True
        print("Defaulting to Selenium method...")

    scraper = GoustoScraper(use_selenium=use_selenium)

    try:
        # URL to scrape
        start_url = "https://www.gousto.co.uk/cookbook/recipes?page=1"

        # Scrape recipes (limit to 5 for testing)
        recipes = scraper.scrape_recipes(start_url, max_recipes=5)

        # Save to JSON file
        if recipes:
            with open('gousto_recipes.json', 'w', encoding='utf-8') as f:
                json.dump(recipes, f, indent=2, ensure_ascii=False)

            print(f"\n🎉 Successfully scraped {len(recipes)} recipes!")
            print("Data saved to 'gousto_recipes.json'")

            # Print sample data with structured ingredients
            if recipes:
                print("\nSample recipe data:")
                sample = recipes[0]
                print(f"Title: {sample['title']}")
                print(f"Image: {sample['image_url']}")
                print(f"Prep Time: {sample['prep_time']}")
                print(f"Serving: {sample['serving']}")
                print(f"Ingredients: {len(sample['ingredients'])} items")

                # Show first few ingredients in structured format
                if sample['ingredients']:
                    print("Sample ingredients:")
                    for i, ing in enumerate(sample['ingredients'][:3]):
                        print(f"  {i+1}. {ing['amount']} {ing['unit']} {ing['name']}".strip())

                print(f"Steps: {len(sample['steps'])} steps")
        else:
            print("\n❌ No recipes were successfully scraped.")
            print("This might be due to:")
            print("- Website changes")
            print("- Anti-bot measures")
            print("- Network issues")

    finally:
        # Clean up
        del scraper

if __name__ == "__main__":
    main()