Chapter 39: Web Scraping
The web is the largest database ever built. Billions of pages of prices, news, job listings, sports scores, research data — most of it accessible to anyone with a browser. Web scraping is the skill of collecting that data automatically, at scale, with Python.
This chapter teaches you to scrape the right way: respectfully, robustly, and legally.
How the Web Works (What You Need to Know)
When you visit a website, your browser:
- Sends an HTTP GET request to the server
- Receives HTML — plain text that describes the page structure
- Parses that HTML and renders it visually
Web scraping replaces steps 1 and 3 with Python. You send the same HTTP request and parse the HTML yourself — extracting just the data you need.
Browser: GET https://example.com/products
Server: -> 200 OK + HTML page
Browser: Parse HTML -> render page visually
Python: GET https://example.com/products
Server: -> 200 OK + HTML page
Python: Parse HTML -> extract product names and prices
Before You Scrape — Check These Things
- robots.txt — Every site has https://site.com/robots.txt. It specifies which paths bots may visit. Respect it.
- Terms of Service — Some sites explicitly prohibit scraping. Read the ToS.
- Rate limits — Don't hammer servers. Add delays between requests.
- APIs first — Many sites have official APIs that are easier, more reliable, and explicitly allowed.
import requests

# Fetch the site's robots.txt before scraping anything else — it lists
# the paths that bots are (dis)allowed to visit.
response = requests.get("https://books.toscrape.com/robots.txt")
print(response.text)
requests — Fetching Pages
pip install requests beautifulsoup4 lxml
import requests

# Fetch one page and inspect the response object.
resp = requests.get("https://books.toscrape.com/")
print(resp.status_code)               # 200 on success
print(resp.headers["Content-Type"])   # text/html; charset=utf-8
print(len(resp.text))                 # size of the HTML document
Setting headers to look like a browser
Some servers block requests that don't look like a browser. Add a User-Agent header:
# A desktop-Chrome User-Agent string; some servers reject the default
# "python-requests/x.y" agent outright.
browser_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)
headers = {"User-Agent": browser_agent}
response = requests.get("https://books.toscrape.com/", headers=headers)
Handling errors
import requests
from requests.exceptions import RequestException
def fetch_page(url: str, timeout: int = 10) -> str | None:
    """Fetch a URL and return its HTML, or None on failure."""
    # Identify the bot honestly so site owners know who is visiting.
    bot_headers = {"User-Agent": "MyBot/1.0 (educational purposes)"}
    try:
        resp = requests.get(url, timeout=timeout, headers=bot_headers)
        resp.raise_for_status()  # raises HTTPError for 4xx/5xx
    except requests.HTTPError as e:
        print(f"HTTP error {e.response.status_code}: {url}")
    except requests.ConnectionError:
        print(f"Cannot connect: {url}")
    except requests.Timeout:
        print(f"Timeout: {url}")
    except RequestException as e:
        print(f"Request failed: {e}")
    else:
        # Only reached when no exception fired — the success path.
        return resp.text
    return None
BeautifulSoup — Parsing HTML
BeautifulSoup parses HTML and lets you navigate and search the element tree:
from bs4 import BeautifulSoup

# A small fixed HTML document to practise on: a heading plus a list of
# two books, each carrying a price in a data- attribute and a link.
html = """
<html>
<body>
<h1 class="title">Python Books</h1>
<ul id="book-list">
<li class="book" data-price="29.99">
<a href="/book/1">Learn Python</a>
</li>
<li class="book" data-price="39.99">
<a href="/book/2">Python Tricks</a>
</li>
</ul>
</body>
</html>
"""

# Parse with the lxml backend; the result is a navigable element tree.
soup = BeautifulSoup(html, "lxml")
Finding elements
# --- Locating single elements ---
heading = soup.find("h1")   # first matching tag in document order
print(heading.text)         # Python Books
print(heading["class"])     # ['title']

# --- Locating collections ---
book_items = soup.find_all("li", class_="book")
print(len(book_items))      # 2

# The same queries expressed as CSS selectors (very powerful)
book_items = soup.select("ul#book-list li.book")
link_tags = soup.select("li.book a")
first_book = soup.select_one("li.book")

# --- Walking the tree relative to an element ---
book_list = soup.find("ul")
lead_item = book_list.find("li")        # first child li
every_item = book_list.find_all("li")   # all child lis
container = lead_item.parent            # back up to the ul element
following = list(lead_item.next_siblings)
Extracting data
# Pull the title, price, and link out of every book entry.
for entry in soup.find_all("li", class_="book"):
    anchor = entry.find("a")        # look the link up once, use it twice
    title = anchor.text.strip()     # text content of the element
    price = entry["data-price"]     # attribute access via indexing
    href = anchor["href"]
    print(f"{title}: ${price} — {href}")
Output:
Learn Python: $29.99 — /book/1
Python Tricks: $39.99 — /book/2
A Real Scraper — Books to Scrape
books.toscrape.com is a practice site built specifically for learning web scraping. Let's scrape its book catalogue:
import requests
import time
from bs4 import BeautifulSoup
from dataclasses import dataclass
@dataclass
class Book:
    # One catalogue entry scraped from books.toscrape.com.
    title: str    # from the link's title attribute
    price: float  # numeric price (pound sign stripped)
    rating: int   # star rating 1-5; 0 when the class isn't recognised
    url: str      # absolute URL of the book's detail page
# The site encodes star ratings as CSS class names ("star-rating Three");
# this maps those words to numbers.
RATING_MAP = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}
BASE_URL = "https://books.toscrape.com"
def parse_book(article) -> Book:
    """Extract a Book from a catalogue <article> element.

    Args:
        article: A BeautifulSoup element for one ``article.product_pod``.

    Returns:
        A populated Book with an absolute detail-page URL.
    """
    link = article.find("h3").find("a")
    title = link["title"]
    # The price renders as "£51.77", but a mis-detected response encoding
    # can turn it into "Â£51.77". Keeping only digits and the decimal
    # point is robust to both, unlike strip("£Â") which only trims those
    # exact characters from the ends.
    price_text = article.find("p", class_="price_color").text
    price = float("".join(c for c in price_text if c.isdigit() or c == "."))
    # Rating is encoded as a class name, e.g. class="star-rating Three".
    # Search the class list instead of assuming the word is at index 1.
    rating_classes = article.find("p", class_="star-rating")["class"]
    rating = next((RATING_MAP[c] for c in rating_classes if c in RATING_MAP), 0)
    href = link["href"]
    url = f"{BASE_URL}/catalogue/{href.replace('../', '')}"
    return Book(title=title, price=price, rating=rating, url=url)
def scrape_page(url: str) -> tuple[list[Book], str | None]:
    """
    Scrape one catalogue page.

    Args:
        url: Absolute URL of a catalogue page.

    Returns:
        (books_on_page, next_page_url or None).

    Raises:
        requests.HTTPError: On a 4xx/5xx response.
    """
    from urllib.parse import urljoin  # stdlib; resolves relative hrefs

    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    books = [parse_book(a) for a in soup.select("article.product_pod")]
    # The "next" button holds a relative href (e.g. "page-2.html").
    # urljoin resolves it against the current page URL correctly, even if
    # the href contains "../" segments — unlike the rsplit("/") string
    # surgery, which breaks on anything but a sibling filename.
    next_btn = soup.select_one("li.next a")
    next_url = urljoin(url, next_btn["href"]) if next_btn else None
    return books, next_url
def scrape_catalogue(
    start_url: str = f"{BASE_URL}/catalogue/page-1.html",
    max_pages: int = 5,
    delay: float = 1.0,
) -> list[Book]:
    """Scrape up to max_pages of the catalogue."""
    collected: list[Book] = []
    next_url: str | None = start_url
    for page in range(1, max_pages + 1):
        if not next_url:
            break  # ran out of pages before hitting the limit
        print(f"Scraping page {page}: {next_url}")
        page_books, upcoming = scrape_page(next_url)
        collected.extend(page_books)
        print(f" Found {len(page_books)} books (total: {len(collected)})")
        next_url = upcoming
        if next_url:
            time.sleep(delay)  # be polite — don't hammer the server
    return collected
def print_summary(books: list[Book]) -> None:
    """Print aggregate price/rating statistics and the top-rated books."""
    if not books:
        print("No books found.")
        return
    price_values = [item.price for item in books]
    rating_values = [item.rating for item in books]
    print(f"\n{'─'*50}")
    print(f"Total books: {len(books)}")
    print(f"Avg price: £{sum(price_values)/len(price_values):.2f}")
    print(f"Cheapest: £{min(price_values):.2f}")
    print(f"Most expensive: £{max(price_values):.2f}")
    print(f"Avg rating: {sum(rating_values)/len(rating_values):.1f}/5")
    print("\nTop 5 highest-rated books:")
    # Highest rating first; ties go to the cheaper book.
    best = sorted(books, key=lambda item: (-item.rating, item.price))[:5]
    for entry in best:
        print(f" {'[*]'*entry.rating:<5} £{entry.price:.2f} {entry.title[:50]}")
if __name__ == "__main__":
books = scrape_catalogue(max_pages=3)
print_summary(books)
Output:
Scraping page 1: https://books.toscrape.com/catalogue/page-1.html
Found 20 books (total: 20)
Scraping page 2: ...
Found 20 books (total: 40)
Scraping page 3: ...
Found 20 books (total: 60)
──────────────────────────────────────────────────
Total books: 60
Avg price: £35.14
Cheapest: £10.00
Most expensive: £59.69
Avg rating: 2.9/5
Top 5 highest-rated books:
[*][*][*][*][*] £17.46 It's Only the Himalayas
[*][*][*][*][*] £45.17 Full Moon over Noah's Ark
...
Handling Pagination
The pattern above — extract data, find the "next" link, repeat — handles any paginated site:
def paginate(start_url: str, get_next_url, scrape_page_fn, max_pages=None):
    """
    Generic pagination driver.

    Args:
        start_url: First page URL.
        get_next_url: soup -> next URL or None.
        scrape_page_fn: soup -> list of items.
        max_pages: Stop after this many pages (None = all).

    Returns:
        All items collected across the visited pages.

    Raises:
        requests.HTTPError: On a 4xx/5xx response.
    """
    url = start_url
    page = 0
    items = []
    while url:
        page += 1
        if max_pages and page > max_pages:
            break
        response = requests.get(url, timeout=10)
        # Fail loudly on HTTP errors instead of silently parsing an error
        # page (matches the behaviour of scrape_page above).
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")
        items.extend(scrape_page_fn(soup))
        url = get_next_url(soup)
        if url:
            time.sleep(1.0)  # politeness delay between pages
    return items
Storing Scraped Data
import csv
import json
from pathlib import Path
from dataclasses import asdict
def save_to_csv(books: list[Book], path: str = "books.csv") -> None:
    """Write the scraped books to a CSV file with a header row."""
    columns = ["title", "price", "rating", "url"]
    with open(path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        for book in books:
            writer.writerow(asdict(book))
    print(f"Saved {len(books)} books to {path}")
def save_to_json(books: list[Book], path: str = "books.json") -> None:
    """Write the scraped books to a pretty-printed JSON file."""
    serialisable = [asdict(book) for book in books]
    payload = json.dumps(serialisable, indent=2)
    Path(path).write_text(payload, encoding="utf-8")
    print(f"Saved {len(books)} books to {path}")
def save_to_sqlite(books: list[Book], db_path: str = "books.db") -> None:
    """Insert books into a SQLite table, skipping duplicate URLs.

    The table is created on first use; the UNIQUE constraint on url plus
    INSERT OR IGNORE makes repeated runs idempotent.
    """
    import sqlite3
    conn = sqlite3.connect(db_path)
    try:
        # "with conn" wraps the statements in a transaction: committed on
        # success, rolled back if anything raises.
        with conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS books (
                    title TEXT,
                    price REAL,
                    rating INTEGER,
                    url TEXT UNIQUE
                )
            """)
            conn.executemany(
                "INSERT OR IGNORE INTO books VALUES (?, ?, ?, ?)",
                [(b.title, b.price, b.rating, b.url) for b in books]
            )
    finally:
        # Previously the connection leaked if execute/executemany raised.
        conn.close()
    print(f"Saved {len(books)} books to {db_path}")
Scraping with Sessions and Cookies
Some sites require you to log in first. Use a requests.Session to persist cookies:
import requests

# A Session persists cookies (and default headers) across requests.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 ..."

# Log in — the server's session cookie is stored on `session`.
credentials = {
    "username": "myuser",
    "password": "mypassword",
}
login_response = session.post("https://example.com/login", data=credentials)
login_response.raise_for_status()

# All subsequent requests use the session (cookies maintained)
page = session.get("https://example.com/dashboard")
Dynamic Pages with Selenium
Some sites load content with JavaScript — the initial HTML is nearly empty, and a script populates it after the page loads. requests + BeautifulSoup can't handle this. You need a real browser.
pip install selenium
# Also download ChromeDriver: https://chromedriver.chromium.org/
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time

# Run headless — no visible browser window
options = Options()
options.add_argument("--headless")
# NOTE(review): the next two flags are commonly needed when running
# inside containers (sandbox unavailable, small /dev/shm) — harmless
# elsewhere.
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)
try:
    driver.get("https://quotes.toscrape.com/js/")
    # Wait for the quotes to load (up to 10 seconds) — the page's
    # JavaScript inserts the .quote elements after the initial HTML.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "quote"))
    )
    # Once at least one quote exists, collect them all and read the
    # rendered text of the child elements.
    quotes = driver.find_elements(By.CLASS_NAME, "quote")
    for quote in quotes:
        text = quote.find_element(By.CLASS_NAME, "text").text
        author = quote.find_element(By.CLASS_NAME, "author").text
        print(f"{author}: {text[:60]}")
finally:
    driver.quit()  # always close the browser
Use Selenium only when necessary — it's slower and more complex than requests + BeautifulSoup. For most sites, static scraping is enough.
A lighter alternative is playwright:
pip install playwright
playwright install chromium
from playwright.sync_api import sync_playwright

# The context manager starts/stops the Playwright driver process.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("https://quotes.toscrape.com/js/")
    # Block until the page's JavaScript has inserted a .quote element.
    page.wait_for_selector(".quote")
    quotes = page.query_selector_all(".quote")
    for q in quotes:
        text = q.query_selector(".text").inner_text()
        author = q.query_selector(".author").inner_text()
        print(f"{author}: {text[:60]}")
    browser.close()
Rate Limiting and Politeness
A good scraper is a polite scraper:
import time
import random
def polite_get(url: str, min_delay: float = 1.0, max_delay: float = 3.0) -> requests.Response:
    """Fetch a URL with a random delay to avoid overwhelming the server."""
    # Random jitter so the request pattern doesn't look mechanical.
    pause = random.uniform(min_delay, max_delay)
    time.sleep(pause)
    bot_headers = {"User-Agent": "ResearchBot/1.0 (contact: you@example.com)"}
    return requests.get(url, timeout=10, headers=bot_headers)
Rules to follow:
- Delay between requests. 1-3 seconds is reasonable. Never fire requests as fast as possible.
- Identify your bot. Put a contact email in the User-Agent so site owners can reach you.
- Honor Retry-After headers. If the server says "wait 60 seconds," wait 60 seconds.
- Cache responses. Don't re-fetch pages you already have.
- Scrape off-peak hours. Less load on their servers.
Project: Job Listing Scraper
"""
job_scraper.py — Scrape job listings from a practice site.
Uses: https://realpython.github.io/fake-jobs/
"""
import csv
import time
from dataclasses import dataclass, asdict, field
from datetime import datetime, timezone
from pathlib import Path

import requests
from bs4 import BeautifulSoup
BASE_URL = "https://realpython.github.io/fake-jobs/"
@dataclass
class Job:
    """One job listing scraped from the fake-jobs practice site."""
    title: str     # job title text
    company: str   # hiring company name
    location: str  # free-text location string
    date: str      # posting date from the <time datetime="..."> attribute
    url: str = ""  # href of the first link on the card, if any
    # Timezone-aware UTC timestamp of when the record was scraped.
    # datetime.utcnow() is deprecated (and returns a naive datetime);
    # datetime.now(timezone.utc) is the supported aware replacement.
    scraped_at: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
def scrape_jobs(url: str = BASE_URL) -> list[Job]:
    """Scrape all job listings from the fake-jobs site."""
    print(f"Fetching: {url}")
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    listings: list[Job] = []
    for card in soup.select("div.card"):
        # The <time> tag and the link may be absent; fall back to "".
        posted = card.select_one("time")
        link = card.select_one("a[href]")
        listings.append(Job(
            title=card.select_one("h2.title").text.strip(),
            company=card.select_one("h3.subtitle").text.strip(),
            location=card.select_one("p.location").text.strip(),
            date=posted["datetime"] if posted else "",
            url=link["href"] if link else "",
        ))
    print(f"Found {len(listings)} jobs")
    return listings
def filter_jobs(
    jobs: list[Job],
    keywords: tuple[str, ...] | list[str] = (),
    location: str = "",
) -> list[Job]:
    """Filter jobs case-insensitively.

    Args:
        jobs: The listings to filter.
        keywords: Keep jobs whose *title* contains any of these keywords.
        location: Keep jobs whose location contains this substring.

    Returns:
        Jobs matching both filters; an empty filter is skipped.
    """
    # (The original docstring claimed keywords matched title OR location;
    # keywords only ever matched the title — location is its own filter.)
    result = jobs
    if keywords:
        kw_lower = [k.lower() for k in keywords]
        result = [j for j in result
                  if any(k in j.title.lower() for k in kw_lower)]
    if location:
        result = [j for j in result
                  if location.lower() in j.location.lower()]
    return result
def save_jobs(jobs: list[Job], path: str = "jobs.csv") -> None:
    """Write job listings to CSV, one column per Job field."""
    # Dataclass field order doubles as the CSV column order.
    columns = list(Job.__dataclass_fields__)
    with open(path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        for job in jobs:
            writer.writerow(asdict(job))
    print(f"Saved to {path}")
if __name__ == "__main__":
    all_jobs = scrape_jobs()

    # Narrow down to Python-related roles by title keyword.
    python_jobs = filter_jobs(all_jobs, keywords=["python", "django", "flask"])
    print(f"\nPython-related jobs: {len(python_jobs)}")
    for listing in python_jobs[:5]:
        print(f" {listing.title} at {listing.company} — {listing.location}")

    # Persist both the full set and the filtered subset.
    save_jobs(all_jobs, "all_jobs.csv")
    save_jobs(python_jobs, "python_jobs.csv")
What You Learned in This Chapter
- Web scraping replaces your browser — Python sends HTTP requests and parses the HTML response.
- Always check
robots.txtand the site's Terms of Service before scraping. requests.get(url)fetches a page..raise_for_status()raises on HTTP errors. Use aUser-Agentheader.BeautifulSoup(html, "lxml")parses HTML..find(),.find_all(),.select(),.select_one()navigate the tree..textextracts text.["attr"]extracts attributes.- Paginate by finding the "next" link and looping.
- Store scraped data in CSV, JSON, or SQLite.
- Use
requests.Sessionto persist cookies across requests (login sessions). - Use
seleniumorplaywrightfor JavaScript-rendered pages. - Be polite: add delays, identify your bot, respect
robots.txt.
What's Next?
Chapter 40 covers Web Development with Flask — building a full server-side web application with routes, HTML templates (Jinja2), forms, and a database. You've scraped data off the web; now you'll serve your own.