import time
import requests
from bs4 import BeautifulSoup
# One shared session so the custom User-Agent and connection pool apply to every request.
session = requests.Session()
session.headers.update({'User-Agent': 'research-bot/1.0'})
def fetch_article(url: str) -> dict:
    """Fetch a page and extract its title and article body text."""
    response = session.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # select_one returns None when there is no <h1>; guard against that.
    heading = soup.select_one('h1')
    title = heading.get_text(strip=True) if heading else ''
    body = ' '.join(node.get_text(' ', strip=True) for node in soup.select('article p'))
    return {'url': url, 'title': title, 'body': body}
records = []
for url in urls:  # urls: an iterable of target URLs, defined elsewhere
    records.append(fetch_article(url))
    time.sleep(1)  # polite rate limiting: at most one request per second
For lightweight data collection, I prefer reliable HTML parsing over brittle browser automation. That means stable headers, polite rate limiting, retries, and explicit extraction rules. If scraping becomes core infrastructure, then I graduate it into a monitored job instead of leaving it as a notebook trick.
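The snippet above covers headers, rate limiting, and extraction, but not retries. A minimal sketch of what that could look like: requests can mount urllib3's Retry policy through an HTTPAdapter, so transient failures are retried inside session.get without touching fetch_article. The specific attempt count and status codes here are assumptions, not values from my actual setup.

from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Hypothetical policy: up to 3 retries, exponential backoff between attempts,
# retrying only on rate-limit and transient server responses.
retry = Retry(
    total=3,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=['GET'],
)
adapter = HTTPAdapter(max_retries=retry)
session.mount('https://', adapter)
session.mount('http://', adapter)

Mounting the adapter on the session keeps the retry logic out of the extraction code entirely, which is the same separation of concerns as putting the User-Agent on the session rather than on each call.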
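And "monitored" mostly means making failures visible instead of silent. A minimal sketch of the graduated loop, assuming the same fetch_article and urls as above, with logging choices that are mine rather than prescribed:

import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
log = logging.getLogger('research-bot')

records, failures = [], []
for url in urls:
    try:
        records.append(fetch_article(url))
    except requests.RequestException as exc:
        # Record the failure and keep going instead of crashing the whole run.
        failures.append(url)
        log.warning('fetch failed for %s: %s', url, exc)
    time.sleep(1)
log.info('fetched %d articles, %d failures', len(records), len(failures))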