Init commit with working script

.gitignore (vendored, Normal file, 56 lines)
@@ -0,0 +1,56 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual Environment
venv/
ENV/
env/

# IDE
.idea/
.vscode/
*.swp
*.swo
*~

# Project specific
gousto_recipes.json
.DS_Store

# Logs and databases
*.log
*.sqlite3

# Environment variables
.env
.env.local
.env.*.local

# Jupyter Notebook
.ipynb_checkpoints

# Local development settings
local_settings.py

# Selenium
*.crx
*.pem

README.md (Normal file, 73 lines)
@@ -0,0 +1,73 @@
# Gousto Recipe Scraper

A Python script to scrape recipe data from Gousto's website and save it to a JSON file.

## Prerequisites

- Python 3.7+
- Chrome or Chromium browser (for Selenium)
- ChromeDriver (will be installed automatically by webdriver-manager)

## Setup

1. Clone this repository:
```bash
git clone <repository-url>
cd gousto-scraper
```

2. Create and activate a virtual environment:
```bash
# On Linux/MacOS
python3 -m venv venv
source venv/bin/activate

# On Windows
python -m venv venv
.\venv\Scripts\activate
```

3. Install the required packages:
```bash
pip install -r requirements.txt
```

## Usage

Run the scraper with the following command:

```bash
python scraper.py
```

This will:
1. Scrape recipe data from Gousto's website
2. Save the results to `gousto_recipes.json`

### Options

- `--use-selenium` (default: True): Use Selenium for JavaScript rendering
- `--headless` (default: True): Run browser in headless mode
- `--max-pages`: Maximum number of recipe pages to scrape (default: all)
- `--output`: Output JSON file path (default: gousto_recipes.json)

Example:
```bash
python scraper.py --max-pages 5 --output recipes.json
```
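
The flags above could be wired into `scraper.py` with `argparse`, roughly as in the sketch below. This is a hypothetical illustration only: the committed `scraper.py` currently chooses the scraping method through an interactive prompt in `main()` rather than parsing command-line options.

```python
# Hypothetical sketch: how the documented flags might be declared with argparse.
# scraper.py does not implement this yet; flag names mirror the Options list above.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description="Gousto recipe scraper")
    parser.add_argument("--use-selenium", action="store_true", default=True,
                        help="Use Selenium for JavaScript rendering")
    parser.add_argument("--headless", action="store_true", default=True,
                        help="Run browser in headless mode")
    parser.add_argument("--max-pages", type=int, default=None,
                        help="Maximum number of recipe pages to scrape (default: all)")
    parser.add_argument("--output", default="gousto_recipes.json",
                        help="Output JSON file path")
    return parser.parse_args()
```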

## Output

The script saves the scraped data to a JSON file containing an array of recipe objects, each including:

- Title
- Description
- Ingredients
- Cooking time
- Nutritional information
- And more
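
For reference, a single recipe object has the shape sketched below. The field names match what `scraper.py` collects; the values are illustrative placeholders, not real scraped data.

```json
[
  {
    "url": "https://www.gousto.co.uk/cookbook/recipes/example-recipe",
    "title": "Example Recipe",
    "image_url": "https://www.gousto.co.uk/images/example-recipe.jpg",
    "prep_time": "30 min",
    "serving": "2 people",
    "ingredients": [
      {"name": "butter", "amount": "250", "unit": "g"}
    ],
    "description": "A short description of the recipe.",
    "steps": ["Preheat your oven to 200C.", "..."]
  }
]
```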

## Notes

- This script is for educational purposes only
- Be respectful of Gousto's website - don't make too many requests in a short period
- The website structure might change over time, which could break the scraper

requirements.txt (Normal file, 4 lines)
@@ -0,0 +1,4 @@
selenium==4.15.2
webdriver-manager==4.0.1
beautifulsoup4==4.12.2
lxml==4.9.3

scraper.py (Normal file, 674 lines)
@@ -0,0 +1,674 @@
import requests
from bs4 import BeautifulSoup
import json
import re
from urllib.parse import urljoin, urlparse
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException


class GoustoScraper:
    def __init__(self, use_selenium=True):
        self.base_url = "https://www.gousto.co.uk"
        self.use_selenium = use_selenium

        if use_selenium:
            # Setup Selenium WebDriver with more robust options
            chrome_options = Options()
            chrome_options.add_argument('--headless=new')  # Use new headless mode
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument('--disable-extensions')
            chrome_options.add_argument('--disable-background-timer-throttling')
            chrome_options.add_argument('--disable-renderer-backgrounding')
            chrome_options.add_argument('--disable-backgrounding-occluded-windows')
            chrome_options.add_argument('--disable-ipc-flooding-protection')
            chrome_options.add_argument('--window-size=1920,1080')
            chrome_options.add_argument('--remote-debugging-port=9222')
            chrome_options.add_argument('--disable-features=TranslateUI')
            chrome_options.add_argument('--disable-default-apps')
            chrome_options.add_argument('--disable-logging')
            chrome_options.add_argument('--disable-plugins')
            chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

            # Set additional options for stability
            chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
            chrome_options.add_experimental_option('useAutomationExtension', False)

            try:
                # Try different methods to initialize Chrome
                print("Attempting to initialize Chrome WebDriver...")

                # Method 1: Try with webdriver-manager (if available)
                try:
                    from webdriver_manager.chrome import ChromeDriverManager
                    from selenium.webdriver.chrome.service import Service

                    service = Service(ChromeDriverManager().install())
                    self.driver = webdriver.Chrome(service=service, options=chrome_options)
                    print("✓ Selenium WebDriver initialized with webdriver-manager")

                except ImportError:
                    print("webdriver-manager not available, trying system Chrome...")
                    # Method 2: Try system Chrome
                    self.driver = webdriver.Chrome(options=chrome_options)
                    print("✓ Selenium WebDriver initialized with system Chrome")

                except Exception as e:
                    print(f"Failed with system Chrome: {e}")
                    # Method 3: Try with explicit Chrome binary path
                    chrome_options.binary_location = "/usr/bin/google-chrome"  # Common Linux path
                    self.driver = webdriver.Chrome(options=chrome_options)
                    print("✓ Selenium WebDriver initialized with explicit binary path")

                self.wait = WebDriverWait(self.driver, 10)

                # Test the driver
                self.driver.get("https://www.google.com")
                print("✓ WebDriver test successful")

            except Exception as e:
                print(f"✗ All Selenium initialization methods failed: {e}")
                print("Common solutions:")
                print("1. Install Chrome browser: sudo apt-get install google-chrome-stable")
                print("2. Install webdriver-manager: pip install webdriver-manager")
                print("3. Download ChromeDriver manually from https://chromedriver.chromium.org/")
                print("4. Make sure Chrome and ChromeDriver versions match")
                print("\nFalling back to requests method...")
                self.use_selenium = False
                self._init_requests_session()
        else:
            self._init_requests_session()

    def _init_requests_session(self):
        """Initialize requests session as fallback"""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    def get_page_content_selenium(self, url):
        """Get page content using Selenium (handles JavaScript)"""
        try:
            print(f"Loading page with Selenium: {url}")
            self.driver.get(url)

            # Wait for recipe cards to load
            try:
                # Wait for recipe elements to be present
                self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "a")))

                # Scroll down to trigger lazy loading
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)

                # Scroll back up
                self.driver.execute_script("window.scrollTo(0, 0);")
                time.sleep(1)

            except TimeoutException:
                print("Timeout waiting for page elements to load")

            return self.driver.page_source

        except Exception as e:
            print(f"Error loading page with Selenium: {e}")
            return None

    def get_page_content_requests(self, url):
        """Get page content using requests (static HTML only)"""
        try:
            response = self.session.get(url)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Error fetching page with requests: {e}")
            return None

    def get_recipe_links(self, page_url):
        """Extract recipe links from the cookbook page"""
        # Get page content
        if self.use_selenium:
            html_content = self.get_page_content_selenium(page_url)
        else:
            html_content = self.get_page_content_requests(page_url)

        if not html_content:
            return []

        soup = BeautifulSoup(html_content, 'html.parser')

        # Debug: Print some of the HTML to see what we're working with
        print("Analyzing page structure...")
        print(f"Page title: {soup.title.string if soup.title else 'No title found'}")

        # More comprehensive selectors for Gousto
        recipe_links = set()  # Use set to avoid duplicates

        # Different selector strategies
        selectors = [
            # Try different link patterns
            'a[href*="/cookbook/recipes/"]:not([href*="?page="])',
            'a[href^="/cookbook/recipes/"]:not([href*="?"])',
            '[data-test-id*="recipe"] a',
            '[data-testid*="recipe"] a',
            '.recipe-card a',
            '.recipe-tile a',
            '.recipe a',
            'article a[href*="/cookbook/recipes/"]',
            'div[class*="recipe"] a',
            # More generic approaches
            'a[href*="/recipes/"]:not([href*="?page="])',
        ]

        for selector in selectors:
            try:
                links = soup.select(selector)
                print(f"Selector '{selector}' found {len(links)} links")

                for link in links:
                    href = link.get('href')
                    if href:
                        # Clean and validate the URL
                        if href.startswith('/'):
                            full_url = urljoin(self.base_url, href)
                        else:
                            full_url = href

                        # Filter out pagination and non-recipe links
                        if ('/cookbook/recipes/' in full_url and
                                '?page=' not in full_url and
                                full_url != page_url):
                            recipe_links.add(full_url)

                if recipe_links:
                    break  # If we found links with this selector, stop trying others

            except Exception as e:
                print(f"Error with selector '{selector}': {e}")
                continue

        recipe_links = list(recipe_links)
        print(f"Total unique recipe links found: {len(recipe_links)}")

        # Debug: Print first few links
        for i, link in enumerate(recipe_links[:3]):
            print(f"Sample link {i+1}: {link}")

        return recipe_links

    def filter_steps(self, raw_steps, recipe_title=''):
        """Filter out unwanted items from steps array (breadcrumbs, navigation, etc.)"""
        if not raw_steps:
            return []

        filtered_steps = []

        # Common unwanted step content to filter out
        unwanted_patterns = [
            'cookbook',
            'recipes',
            'home',
            'menu',
            'navigation',
            'skip to',
            'back to',
            'print recipe',
            'save recipe',
            'rate this recipe',
            'share',
            'pinterest',
            'facebook',
            'twitter',
            'instagram'
        ]

        # Convert recipe title to lowercase for comparison
        recipe_title_lower = recipe_title.lower() if recipe_title else ''

        for i, step in enumerate(raw_steps):
            step_lower = step.lower().strip()

            # Skip empty steps
            if not step_lower:
                continue

            # Skip if it's exactly "Cookbook"
            if step_lower == 'cookbook':
                continue

            # Skip if it matches the recipe title exactly
            if recipe_title_lower and step_lower == recipe_title_lower:
                continue

            # Skip if it's very short and likely navigation (less than 10 chars)
            if len(step) < 10 and any(pattern in step_lower for pattern in unwanted_patterns):
                continue

            # Skip if it contains unwanted patterns and is short (likely navigation)
            if len(step) < 50 and any(pattern in step_lower for pattern in unwanted_patterns):
                continue

            # Skip steps that are just numbers (step numbers without content)
            if step.strip().isdigit():
                continue

            # Skip if it's just a single word that's likely navigation
            if len(step.split()) == 1 and len(step) < 15:
                # But keep single words that are likely cooking instructions
                cooking_words = ['preheat', 'heat', 'boil', 'simmer', 'bake', 'fry', 'mix', 'stir', 'serve']
                if not any(word in step_lower for word in cooking_words):
                    continue

            # Skip if it looks like a breadcrumb pattern (word > word > word)
            if '>' in step or '»' in step or ('/' in step and len(step.split()) <= 4):
                continue

            # If we've made it this far, it's likely a real cooking step
            filtered_steps.append(step.strip())

        # Additional filtering: remove the first few items if they still look like breadcrumbs
        # This handles cases where the first 1-2 items are navigation that slipped through
        if len(filtered_steps) >= 2:
            # Check if first item is very short and generic
            first_item = filtered_steps[0].lower()
            if (len(first_item) < 20 and
                    any(pattern in first_item for pattern in ['cookbook', 'recipe', 'home', 'menu'])):
                filtered_steps = filtered_steps[1:]

            # Check if second item is the recipe title or similar
            if (len(filtered_steps) >= 2 and recipe_title_lower and
                    filtered_steps[0].lower() == recipe_title_lower):
                filtered_steps = filtered_steps[1:]

        return filtered_steps

    def parse_ingredient(self, ingredient_text):
        """Parse ingredient text into structured format with name, amount, and unit"""
        ingredient_text = ingredient_text.strip()

        # Common units and their variations
        units = {
            # Volume
            'ml', 'millilitre', 'milliliters', 'millilitres',
            'l', 'litre', 'liter', 'litres', 'liters',
            'cup', 'cups', 'c',
            'tbsp', 'tablespoon', 'tablespoons', 'tbs', 'tb',
            'tsp', 'teaspoon', 'teaspoons', 'ts',
            'fl oz', 'fluid ounce', 'fluid ounces',
            'pint', 'pints', 'pt',

            # Weight
            'g', 'gram', 'grams', 'gr',
            'kg', 'kilogram', 'kilograms',
            'oz', 'ounce', 'ounces',
            'lb', 'pound', 'pounds', 'lbs',

            # Count/pieces
            'piece', 'pieces', 'pc', 'pcs',
            'slice', 'slices',
            'clove', 'cloves',
            'bunch', 'bunches',
            'handful', 'handfuls',
            'pinch', 'pinches',
            'dash', 'dashes',
            'sprig', 'sprigs',
            'stick', 'sticks',
            'can', 'cans', 'tin', 'tins',
            'jar', 'jars',
            'packet', 'packets', 'pack', 'packs',
            'box', 'boxes',
            'bag', 'bags',

            # Special measurements
            'to taste', 'as needed', 'optional'
        }

        # Create a pattern to match numbers (including fractions and decimals)
        number_pattern = r'\d+(?:[/.]\d+)?(?:\.\d+)?|\d*\.\d+'

        # Try to extract amount and unit
        ingredient = {
            'name': ingredient_text,
            'amount': '',
            'unit': ''
        }

        # Pattern 1: "2 cups flour" or "250g butter"
        pattern1 = r'^(' + number_pattern + r')\s*([a-zA-Z\s]+?)\s+(.+)$'
        match1 = re.match(pattern1, ingredient_text, re.IGNORECASE)

        if match1:
            amount = match1.group(1)
            potential_unit = match1.group(2).strip().lower()
            name = match1.group(3).strip()

            # Check if the potential unit is in our units list
            if any(unit in potential_unit for unit in units):
                ingredient['amount'] = amount
                ingredient['unit'] = match1.group(2).strip()  # Keep original case
                ingredient['name'] = name
                return ingredient

        # Pattern 2: "flour - 2 cups" or "butter (250g)"
        pattern2 = r'^(.+?)[\s\-\(]+(' + number_pattern + r')\s*([a-zA-Z\s]*?)[\)\s]*$'
        match2 = re.match(pattern2, ingredient_text, re.IGNORECASE)

        if match2:
            name = match2.group(1).strip()
            amount = match2.group(2)
            potential_unit = match2.group(3).strip().lower()

            if any(unit in potential_unit for unit in units) or not potential_unit:
                ingredient['name'] = name
                ingredient['amount'] = amount
                ingredient['unit'] = match2.group(3).strip()  # Keep original case
                return ingredient

        # Pattern 3: Just amount at start "2 onions" or "1 large egg"
        pattern3 = r'^(' + number_pattern + r')\s+(.+)$'
        match3 = re.match(pattern3, ingredient_text, re.IGNORECASE)

        if match3:
            amount = match3.group(1)
            rest = match3.group(2).strip()

            # Check if it starts with a unit
            words = rest.split()
            if words and words[0].lower() in units:
                ingredient['amount'] = amount
                ingredient['unit'] = words[0]
                ingredient['name'] = ' '.join(words[1:]) if len(words) > 1 else rest
                return ingredient
            else:
                # No explicit unit, treat as pieces/items
                ingredient['amount'] = amount
                ingredient['unit'] = ''
                ingredient['name'] = rest
                return ingredient

        # Pattern 4: Fractional amounts "1/2 cup sugar"
        fraction_pattern = r'^(\d+/\d+)\s+([a-zA-Z\s]+?)\s+(.+)$'
        match4 = re.match(fraction_pattern, ingredient_text, re.IGNORECASE)

        if match4:
            amount = match4.group(1)
            potential_unit = match4.group(2).strip().lower()
            name = match4.group(3).strip()

            if any(unit in potential_unit for unit in units):
                ingredient['amount'] = amount
                ingredient['unit'] = match4.group(2).strip()
                ingredient['name'] = name
                return ingredient

        # If no patterns match, return as-is (some ingredients might not have amounts)
        return ingredient

    def scrape_recipe_details(self, recipe_url):
        """Scrape detailed recipe information from individual recipe page"""
        try:
            if self.use_selenium:
                html_content = self.get_page_content_selenium(recipe_url)
            else:
                response = self.session.get(recipe_url)
                response.raise_for_status()
                html_content = response.text

            if not html_content:
                return None

            soup = BeautifulSoup(html_content, 'html.parser')

            recipe_data = {
                'url': recipe_url,
                'title': '',
                'image_url': '',
                'prep_time': '',
                'serving': '',
                'ingredients': [],
                'description': '',
                'steps': []
            }

            # Extract title with more selectors
            title_selectors = [
                'h1',
                '[data-test-id*="title"]',
                '[data-testid*="title"]',
                '.recipe-title',
                '.title',
                'title'
            ]
            for selector in title_selectors:
                title_elem = soup.select_one(selector)
                if title_elem and title_elem.get_text(strip=True):
                    recipe_data['title'] = title_elem.get_text(strip=True)
                    break

            # Extract image URL with more selectors
            img_selectors = [
                'img[src*="recipe"]',
                'img[alt*="recipe"]',
                '.recipe-image img',
                '.hero-image img',
                'img[src*="gousto"]',
                'main img',
                'img[src*="cloudinary"]'  # Common CDN for food images
            ]
            for selector in img_selectors:
                img_elem = soup.select_one(selector)
                if img_elem:
                    src = img_elem.get('src') or img_elem.get('data-src') or img_elem.get('data-lazy')
                    if src and ('recipe' in src.lower() or 'gousto' in src.lower()):
                        if src.startswith('//'):
                            src = 'https:' + src
                        elif src.startswith('/'):
                            src = urljoin(self.base_url, src)
                        recipe_data['image_url'] = src
                        break

            # Extract prep time
            time_patterns = [
                r'(\d+)\s*min',
                r'(\d+)\s*hour',
                r'(\d+)\s*hr'
            ]

            time_text = soup.get_text()
            for pattern in time_patterns:
                match = re.search(pattern, time_text, re.IGNORECASE)
                if match:
                    recipe_data['prep_time'] = match.group(0)
                    break

            # Extract serving information
            serving_pattern = r'(\d+)\s*(?:serve|serving|portion|people)'
            serving_match = re.search(serving_pattern, soup.get_text(), re.IGNORECASE)
            if serving_match:
                recipe_data['serving'] = serving_match.group(0)

            # Extract ingredients and parse them into structured format
            ingredient_selectors = [
                '.ingredient',
                '.ingredients li',
                '[data-test-id*="ingredient"]',
                'ul[class*="ingredient"] li',
                '.recipe-ingredients li'
            ]

            for selector in ingredient_selectors:
                ingredients = soup.select(selector)
                if ingredients:
                    raw_ingredients = [ing.get_text(strip=True) for ing in ingredients if ing.get_text(strip=True)]
                    # Parse each ingredient into structured format
                    recipe_data['ingredients'] = [self.parse_ingredient(ing) for ing in raw_ingredients]
                    break

            # Extract description
            desc_selectors = [
                '.description',
                '.recipe-description',
                'meta[name="description"]',
                '.intro',
                'p'
            ]

            for selector in desc_selectors:
                if selector == 'meta[name="description"]':
                    desc_elem = soup.select_one(selector)
                    if desc_elem:
                        recipe_data['description'] = desc_elem.get('content', '').strip()
                        if recipe_data['description']:
                            break
                else:
                    desc_elem = soup.select_one(selector)
                    if desc_elem and len(desc_elem.get_text(strip=True)) > 20:
                        recipe_data['description'] = desc_elem.get_text(strip=True)
                        break

            # Extract steps
            step_selectors = [
                '.step',
                '.instruction',
                '.method li',
                '.directions li',
                'ol li',
                '[data-test-id*="step"]'
            ]

            for selector in step_selectors:
                steps = soup.select(selector)
                if steps:
                    raw_steps = [step.get_text(strip=True) for step in steps if step.get_text(strip=True)]
                    # Filter out unwanted steps (breadcrumbs, navigation, etc.)
                    recipe_data['steps'] = self.filter_steps(raw_steps, recipe_data.get('title', ''))
                    break

            return recipe_data

        except Exception as e:
            print(f"Error scraping recipe {recipe_url}: {e}")
            return None

    def scrape_recipes(self, start_url, max_recipes=10):
        """Main method to scrape recipes"""
        print(f"Starting to scrape recipes from: {start_url}")
        print(f"Using {'Selenium' if self.use_selenium else 'Requests'} for web scraping")

        # Get recipe links from the main page
        recipe_links = self.get_recipe_links(start_url)

        if not recipe_links:
            print("No recipe links found. Possible reasons:")
            print("1. Website structure has changed")
            print("2. Content is loaded dynamically and requires JavaScript")
            print("3. Website is blocking automated requests")
            return []

        # Limit the number of recipes to scrape
        recipe_links = recipe_links[:max_recipes]

        recipes = []

        for i, recipe_url in enumerate(recipe_links, 1):
            print(f"\nScraping recipe {i}/{len(recipe_links)}: {recipe_url}")

            recipe_data = self.scrape_recipe_details(recipe_url)

            if recipe_data and recipe_data['title']:
                recipes.append(recipe_data)
                print(f"✓ Successfully scraped: {recipe_data['title']}")
            else:
                print(f"✗ Failed to scrape recipe: {recipe_url}")

            # Be respectful - add delay between requests
            time.sleep(2)

        return recipes

    def __del__(self):
        """Clean up Selenium driver"""
        if hasattr(self, 'driver'):
            try:
                self.driver.quit()
            except:
                pass

def main():
    # Option 1: Try with Selenium first (handles JavaScript), fallback to requests
    print("=== Gousto Recipe Scraper ===")
    print("Choose scraping method:")
    print("1. Selenium (recommended - handles JavaScript)")
    print("2. Requests only (faster, but may miss content)")

    try:
        choice = input("Enter choice (1 or 2, default=1): ").strip()
        if choice == '2':
            use_selenium = False
            print("Using requests-only method...")
        else:
            use_selenium = True
            print("Attempting to use Selenium...")
    except (EOFError, KeyboardInterrupt):
        # Handle case where input() isn't available (like in some environments)
        use_selenium = True
        print("Defaulting to Selenium method...")

    scraper = GoustoScraper(use_selenium=use_selenium)

    try:
        # URL to scrape
        start_url = "https://www.gousto.co.uk/cookbook/recipes?page=1"

        # Scrape recipes (limit to 5 for testing)
        recipes = scraper.scrape_recipes(start_url, max_recipes=5)

        # Save to JSON file
        if recipes:
            with open('gousto_recipes.json', 'w', encoding='utf-8') as f:
                json.dump(recipes, f, indent=2, ensure_ascii=False)

            print(f"\n🎉 Successfully scraped {len(recipes)} recipes!")
            print("Data saved to 'gousto_recipes.json'")

            # Print sample data with structured ingredients
            if recipes:
                print("\nSample recipe data:")
                sample = recipes[0]
                print(f"Title: {sample['title']}")
                print(f"Image: {sample['image_url']}")
                print(f"Prep Time: {sample['prep_time']}")
                print(f"Serving: {sample['serving']}")
                print(f"Ingredients: {len(sample['ingredients'])} items")

                # Show first few ingredients in structured format
                if sample['ingredients']:
                    print("Sample ingredients:")
                    for i, ing in enumerate(sample['ingredients'][:3]):
                        print(f" {i+1}. {ing['amount']} {ing['unit']} {ing['name']}".strip())

                print(f"Steps: {len(sample['steps'])} steps")
        else:
            print("\n❌ No recipes were successfully scraped.")
            print("This might be due to:")
            print("- Website changes")
            print("- Anti-bot measures")
            print("- Network issues")

    finally:
        # Clean up
        del scraper


if __name__ == "__main__":
    main()