Chapter 39: Web Scraping
The web is the largest database ever built. Billions of pages of prices, news, job listings, sports scores, research data — most of it accessible to anyone with a browser. Web scraping is the skill of collecting that data automatically, at scale, with Python.
This chapter teaches you to scrape the right way: respectfully, robustly, and legally.
How the Web Works (What You Need to Know)
When you visit a website, your browser:
- Sends an HTTP GET request to the server
- Receives HTML — plain text that describes the page structure
- Parses that HTML and renders it visually
Web scraping replaces steps 1 and 3 with Python. You send the same HTTP request and parse the HTML yourself — extracting just the data you need.
Browser: GET https://example.com/products
Server: -> 200 OK + HTML page
Browser: Parse HTML -> render page visually
Python: GET https://example.com/products
Server: -> 200 OK + HTML page
Python: Parse HTML -> extract product names and prices
Before You Scrape — Check These Things
- robots.txt — Every site has https://site.com/robots.txt. It specifies which paths bots may visit. Respect it.
- Terms of Service — Some sites explicitly prohibit scraping. Read the ToS.
- Rate limits — Don't hammer servers. Add delays between requests.
- APIs first — Many sites have official APIs that are easier, more reliable, and explicitly allowed.
import requests

# Fetch the site's robots.txt before scraping anything else — it lists
# the paths that bots are (dis)allowed to visit.
response = requests.get("https://books.toscrape.com/robots.txt")
print(response.text)
requests — Fetching Pages
pip install requests beautifulsoup4 lxml
import requests

# Fetch one page and inspect the response object.
resp = requests.get("https://books.toscrape.com/")
print(resp.status_code)               # 200 on success
print(resp.headers["Content-Type"])   # text/html; charset=utf-8
print(len(resp.text))                 # size of the HTML document
Setting headers to look like a browser
Some servers block requests that don't look like a browser. Add a User-Agent header:
# A desktop-Chrome User-Agent string; some servers reject the default
# "python-requests/x.y" agent outright.
browser_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)
headers = {"User-Agent": browser_agent}
response = requests.get("https://books.toscrape.com/", headers=headers)
Handling errors
import requests
from requests.exceptions import RequestException
def fetch_page(url: str, timeout: int = 10) -> str | None:
    """Fetch a URL and return its HTML, or None on failure."""
    # Identify the bot honestly so site owners know who is visiting.
    bot_headers = {"User-Agent": "MyBot/1.0 (educational purposes)"}
    try:
        resp = requests.get(url, timeout=timeout, headers=bot_headers)
        resp.raise_for_status()  # raises HTTPError for 4xx/5xx
    except requests.HTTPError as e:
        print(f"HTTP error {e.response.status_code}: {url}")
    except requests.ConnectionError:
        print(f"Cannot connect: {url}")
    except requests.Timeout:
        print(f"Timeout: {url}")
    except RequestException as e:
        print(f"Request failed: {e}")
    else:
        # Only reached when no exception fired — the success path.
        return resp.text
    return None
BeautifulSoup — Parsing HTML
BeautifulSoup parses HTML and lets you navigate and search the element tree:
from bs4 import BeautifulSoup

# A small fixed HTML document to practise on: a heading plus a list of
# two books, each carrying a price in a data- attribute and a link.
html = """
<html>
<body>
<h1 class="title">Python Books</h1>
<ul id="book-list">
<li class="book" data-price="29.99">
<a href="/book/1">Learn Python</a>
</li>
<li class="book" data-price="39.99">
<a href="/book/2">Python Tricks</a>
</li>
</ul>
</body>
</html>
"""

# Parse with the lxml backend; the result is a navigable element tree.
soup = BeautifulSoup(html, "lxml")
Finding elements
# --- Locating single elements ---
heading = soup.find("h1")   # first matching tag in document order
print(heading.text)         # Python Books
print(heading["class"])     # ['title']

# --- Locating collections ---
book_items = soup.find_all("li", class_="book")
print(len(book_items))      # 2

# The same queries expressed as CSS selectors (very powerful)
book_items = soup.select("ul#book-list li.book")
link_tags = soup.select("li.book a")
first_book = soup.select_one("li.book")

# --- Walking the tree relative to an element ---
book_list = soup.find("ul")
lead_item = book_list.find("li")        # first child li
every_item = book_list.find_all("li")   # all child lis
container = lead_item.parent            # back up to the ul element
following = list(lead_item.next_siblings)
Extracting data
# Pull the title, price, and link out of every book entry.
for entry in soup.find_all("li", class_="book"):
    anchor = entry.find("a")        # look the link up once, use it twice
    title = anchor.text.strip()     # text content of the element
    price = entry["data-price"]     # attribute access via indexing
    href = anchor["href"]
    print(f"{title}: ${price} — {href}")
Output:
Learn Python: $29.99 — /book/1
Python Tricks: $39.99 — /book/2
A Real Scraper — Books to Scrape
books.toscrape.com is a practice site built specifically for learning web scraping. Let's scrape its book catalogue:
import requests
import time
from bs4 import BeautifulSoup
from dataclasses import dataclass
@dataclass
class Book:
    # One catalogue entry scraped from books.toscrape.com.
    title: str    # from the link's title attribute
    price: float  # numeric price (pound sign stripped)
    rating: int   # star rating 1-5; 0 when the class isn't recognised
    url: str      # absolute URL of the book's detail page
# The site encodes star ratings as CSS class names ("star-rating Three");
# this maps those words to numbers.
RATING_MAP = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}
BASE_URL = "https://books.toscrape.com"
def parse_book(article) -> Book:
    """Extract a Book from a catalogue <article> element.

    Args:
        article: A BeautifulSoup element for one ``article.product_pod``.

    Returns:
        A populated Book with an absolute detail-page URL.
    """
    link = article.find("h3").find("a")
    title = link["title"]
    # The price renders as "£51.77", but a mis-detected response encoding
    # can turn it into "Â£51.77". Keeping only digits and the decimal
    # point is robust to both, unlike strip("£Â") which only trims those
    # exact characters from the ends.
    price_text = article.find("p", class_="price_color").text
    price = float("".join(c for c in price_text if c.isdigit() or c == "."))
    # Rating is encoded as a class name, e.g. class="star-rating Three".
    # Search the class list instead of assuming the word is at index 1.
    rating_classes = article.find("p", class_="star-rating")["class"]
    rating = next((RATING_MAP[c] for c in rating_classes if c in RATING_MAP), 0)
    href = link["href"]
    url = f"{BASE_URL}/catalogue/{href.replace('../', '')}"
    return Book(title=title, price=price, rating=rating, url=url)
def scrape_page(url: str) -> tuple[list[Book], str | None]:
    """
    Scrape one catalogue page.

    Args:
        url: Absolute URL of a catalogue page.

    Returns:
        (books_on_page, next_page_url or None).

    Raises:
        requests.HTTPError: On a 4xx/5xx response.
    """
    from urllib.parse import urljoin  # stdlib; resolves relative hrefs

    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    books = [parse_book(a) for a in soup.select("article.product_pod")]
    # The "next" button holds a relative href (e.g. "page-2.html").
    # urljoin resolves it against the current page URL correctly, even if
    # the href contains "../" segments — unlike the rsplit("/") string
    # surgery, which breaks on anything but a sibling filename.
    next_btn = soup.select_one("li.next a")
    next_url = urljoin(url, next_btn["href"]) if next_btn else None
    return books, next_url
def scrape_catalogue(
    start_url: str = f"{BASE_URL}/catalogue/page-1.html",
    max_pages: int = 5,
    delay: float = 1.0,
) -> list[Book]:
    """Scrape up to max_pages of the catalogue."""
    collected: list[Book] = []
    next_url: str | None = start_url
    for page in range(1, max_pages + 1):
        if not next_url:
            break  # ran out of pages before hitting the limit
        print(f"Scraping page {page}: {next_url}")
        page_books, upcoming = scrape_page(next_url)
        collected.extend(page_books)
        print(f" Found {len(page_books)} books (total: {len(collected)})")
        next_url = upcoming
        if next_url:
            time.sleep(delay)  # be polite — don't hammer the server
    return collected
def print_summary(books: list[Book]) -> None:
    """Print aggregate price/rating statistics and the top-rated books."""
    if not books:
        print("No books found.")
        return
    price_values = [item.price for item in books]
    rating_values = [item.rating for item in books]
    print(f"\n{'─'*50}")
    print(f"Total books: {len(books)}")
    print(f"Avg price: £{sum(price_values)/len(price_values):.2f}")
    print(f"Cheapest: £{min(price_values):.2f}")
    print(f"Most expensive: £{max(price_values):.2f}")
    print(f"Avg rating: {sum(rating_values)/len(rating_values):.1f}/5")
    print("\nTop 5 highest-rated books:")
    # Highest rating first; ties go to the cheaper book.
    best = sorted(books, key=lambda item: (-item.rating, item.price))[:5]
    for entry in best:
        print(f" {'[*]'*entry.rating:<5} £{entry.price:.2f} {entry.title[:50]}")
if __name__ == "__main__":
books = scrape_catalogue(max_pages=3)
print_summary(books)
Output:
Scraping page 1: https://books.toscrape.com/catalogue/page-1.html
Found 20 books (total: 20)
Scraping page 2: ...
Found 20 books (total: 40)
Scraping page 3: ...
Found 20 books (total: 60)
──────────────────────────────────────────────────
Total books: 60
Avg price: £35.14
Cheapest: £10.00
Most expensive: £59.69
Avg rating: 2.9/5
Top 5 highest-rated books:
[*][*][*][*][*] £17.46 It's Only the Himalayas
[*][*][*][*][*] £45.17 Full Moon over Noah's Ark
...
Handling Pagination
The pattern above — extract data, find the "next" link, repeat — handles any paginated site:
def paginate(start_url: str, get_next_url, scrape_page_fn, max_pages=None):
    """
    Generic pagination driver.

    Args:
        start_url: First page URL.
        get_next_url: soup -> next URL or None.
        scrape_page_fn: soup -> list of items.
        max_pages: Stop after this many pages (None = all).

    Returns:
        All items collected across the visited pages.

    Raises:
        requests.HTTPError: On a 4xx/5xx response.
    """
    url = start_url
    page = 0
    items = []
    while url:
        page += 1
        if max_pages and page > max_pages:
            break
        response = requests.get(url, timeout=10)
        # Fail loudly on HTTP errors instead of silently parsing an error
        # page (matches the behaviour of scrape_page above).
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")
        items.extend(scrape_page_fn(soup))
        url = get_next_url(soup)
        if url:
            time.sleep(1.0)  # politeness delay between pages
    return items
Storing Scraped Data
import csv
import json
from pathlib import Path
from dataclasses import asdict
def save_to_csv(books: list[Book], path: str = "books.csv") -> None:
    """Write the scraped books to a CSV file with a header row."""
    columns = ["title", "price", "rating", "url"]
    with open(path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        for book in books:
            writer.writerow(asdict(book))
    print(f"Saved {len(books)} books to {path}")
def save_to_json(books: list[Book], path: str = "books.json") -> None:
    """Write the scraped books to a pretty-printed JSON file."""
    serialisable = [asdict(book) for book in books]
    payload = json.dumps(serialisable, indent=2)
    Path(path).write_text(payload, encoding="utf-8")
    print(f"Saved {len(books)} books to {path}")
def save_to_sqlite(books: list[Book], db_path: str = "books.db") -> None:
    """Insert books into a SQLite table, skipping duplicate URLs.

    The table is created on first use; the UNIQUE constraint on url plus
    INSERT OR IGNORE makes repeated runs idempotent.
    """
    import sqlite3
    conn = sqlite3.connect(db_path)
    try:
        # "with conn" wraps the statements in a transaction: committed on
        # success, rolled back if anything raises.
        with conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS books (
                    title TEXT,
                    price REAL,
                    rating INTEGER,
                    url TEXT UNIQUE
                )
            """)
            conn.executemany(
                "INSERT OR IGNORE INTO books VALUES (?, ?, ?, ?)",
                [(b.title, b.price, b.rating, b.url) for b in books]
            )
    finally:
        # Previously the connection leaked if execute/executemany raised.
        conn.close()
    print(f"Saved {len(books)} books to {db_path}")
Scraping with Sessions and Cookies
Some sites require you to log in first. Use a requests.Session to persist cookies:
import requests

# A Session persists cookies (and default headers) across requests.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 ..."

# Log in — the server's session cookie is stored on `session`.
credentials = {
    "username": "myuser",
    "password": "mypassword",
}
login_response = session.post("https://example.com/login", data=credentials)
login_response.raise_for_status()

# All subsequent requests use the session (cookies maintained)
page = session.get("https://example.com/dashboard")
Dynamic Pages with Selenium
Some sites load content with JavaScript — the initial HTML is nearly empty, and a script populates it after the page loads. requests + BeautifulSoup can't handle this. You need a real browser.
pip install selenium
# Also download ChromeDriver: https://chromedriver.chromium.org/
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time

# Run headless — no visible browser window
options = Options()
options.add_argument("--headless")
# NOTE(review): the next two flags are commonly needed when running
# inside containers (sandbox unavailable, small /dev/shm) — harmless
# elsewhere.
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)
try:
    driver.get("https://quotes.toscrape.com/js/")
    # Wait for the quotes to load (up to 10 seconds) — the page's
    # JavaScript inserts the .quote elements after the initial HTML.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "quote"))
    )
    # Once at least one quote exists, collect them all and read the
    # rendered text of the child elements.
    quotes = driver.find_elements(By.CLASS_NAME, "quote")
    for quote in quotes:
        text = quote.find_element(By.CLASS_NAME, "text").text
        author = quote.find_element(By.CLASS_NAME, "author").text
        print(f"{author}: {text[:60]}")
finally:
    driver.quit()  # always close the browser
Use Selenium only when necessary — it's slower and more complex than requests + BeautifulSoup. For most sites, static scraping is enough.
A lighter alternative is playwright:
pip install playwright
playwright install chromium
from playwright.sync_api import sync_playwright

# The context manager starts/stops the Playwright driver process.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("https://quotes.toscrape.com/js/")
    # Block until the page's JavaScript has inserted a .quote element.
    page.wait_for_selector(".quote")
    quotes = page.query_selector_all(".quote")
    for q in quotes:
        text = q.query_selector(".text").inner_text()
        author = q.query_selector(".author").inner_text()
        print(f"{author}: {text[:60]}")
    browser.close()
Rate Limiting and Politeness
A good scraper is a polite scraper:
import time
import random
def polite_get(url: str, min_delay: float = 1.0, max_delay: float = 3.0) -> requests.Response:
    """Fetch a URL with a random delay to avoid overwhelming the server."""
    # Random jitter so the request pattern doesn't look mechanical.
    pause = random.uniform(min_delay, max_delay)
    time.sleep(pause)
    bot_headers = {"User-Agent": "ResearchBot/1.0 (contact: you@example.com)"}
    return requests.get(url, timeout=10, headers=bot_headers)
Rules to follow:
- Delay between requests. 1-3 seconds is reasonable. Never fire requests as fast as possible.
- Identify your bot. Put a contact email in the User-Agent so site owners can reach you.
- Honor Retry-After headers. If the server says "wait 60 seconds," wait 60 seconds.
- Cache responses. Don't re-fetch pages you already have.
- Scrape off-peak hours. Less load on their servers.
Project: Job Listing Scraper
"""
job_scraper.py — Scrape job listings from a practice site.
Uses: https://realpython.github.io/fake-jobs/
"""
import csv
import time
from dataclasses import dataclass, asdict, field
from datetime import datetime, timezone
from pathlib import Path

import requests
from bs4 import BeautifulSoup
BASE_URL = "https://realpython.github.io/fake-jobs/"
@dataclass
class Job:
    """One job listing scraped from the fake-jobs practice site."""
    title: str     # job title text
    company: str   # hiring company name
    location: str  # free-text location string
    date: str      # posting date from the <time datetime="..."> attribute
    url: str = ""  # href of the first link on the card, if any
    # Timezone-aware UTC timestamp of when the record was scraped.
    # datetime.utcnow() is deprecated (and returns a naive datetime);
    # datetime.now(timezone.utc) is the supported aware replacement.
    scraped_at: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
def scrape_jobs(url: str = BASE_URL) -> list[Job]:
    """Scrape all job listings from the fake-jobs site."""
    print(f"Fetching: {url}")
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    listings: list[Job] = []
    for card in soup.select("div.card"):
        # The <time> tag and the link may be absent; fall back to "".
        posted = card.select_one("time")
        link = card.select_one("a[href]")
        listings.append(Job(
            title=card.select_one("h2.title").text.strip(),
            company=card.select_one("h3.subtitle").text.strip(),
            location=card.select_one("p.location").text.strip(),
            date=posted["datetime"] if posted else "",
            url=link["href"] if link else "",
        ))
    print(f"Found {len(listings)} jobs")
    return listings
def filter_jobs(
    jobs: list[Job],
    keywords: tuple[str, ...] | list[str] = (),
    location: str = "",
) -> list[Job]:
    """Filter jobs case-insensitively.

    Args:
        jobs: The listings to filter.
        keywords: Keep jobs whose *title* contains any of these keywords.
        location: Keep jobs whose location contains this substring.

    Returns:
        Jobs matching both filters; an empty filter is skipped.
    """
    # (The original docstring claimed keywords matched title OR location;
    # keywords only ever matched the title — location is its own filter.)
    result = jobs
    if keywords:
        kw_lower = [k.lower() for k in keywords]
        result = [j for j in result
                  if any(k in j.title.lower() for k in kw_lower)]
    if location:
        result = [j for j in result
                  if location.lower() in j.location.lower()]
    return result
def save_jobs(jobs: list[Job], path: str = "jobs.csv") -> None:
    """Write job listings to CSV, one column per Job field."""
    # Dataclass field order doubles as the CSV column order.
    columns = list(Job.__dataclass_fields__)
    with open(path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        for job in jobs:
            writer.writerow(asdict(job))
    print(f"Saved to {path}")
if __name__ == "__main__":
    all_jobs = scrape_jobs()

    # Narrow down to Python-related roles by title keyword.
    python_jobs = filter_jobs(all_jobs, keywords=["python", "django", "flask"])
    print(f"\nPython-related jobs: {len(python_jobs)}")
    for listing in python_jobs[:5]:
        print(f" {listing.title} at {listing.company} — {listing.location}")

    # Persist both the full set and the filtered subset.
    save_jobs(all_jobs, "all_jobs.csv")
    save_jobs(python_jobs, "python_jobs.csv")
What You Learned in This Chapter
- Web scraping replaces your browser — Python sends HTTP requests and parses the HTML response.
- Always check
robots.txtand the site's Terms of Service before scraping. requests.get(url)fetches a page..raise_for_status()raises on HTTP errors. Use aUser-Agentheader.BeautifulSoup(html, "lxml")parses HTML..find(),.find_all(),.select(),.select_one()navigate the tree..textextracts text.["attr"]extracts attributes.- Paginate by finding the "next" link and looping.
- Store scraped data in CSV, JSON, or SQLite.
- Use
requests.Sessionto persist cookies across requests (login sessions). - Use
seleniumorplaywrightfor JavaScript-rendered pages. - Be polite: add delays, identify your bot, respect
robots.txt.
What's Next?
Chapter 40 covers Web Development with Flask — building a full server-side web application with routes, HTML templates (Jinja2), forms, and a database. You've scraped data off the web; now you'll serve your own.