Advanced Configuration and Performance Tuning for Residential Proxies in the Scrapy Framework
Source: Dev.to
When using Scrapy for large-scale, high-frequency scraping, simple proxy settings quickly become insufficient: naive random IP rotation and fixed delays waste IPs, throttle throughput, and still trip sophisticated anti-bot mechanisms. Deep integration of residential proxies with Scrapy, combined with performance tuning, is essential for building industrial-grade, robust, and efficient data pipelines.
Recommended Architecture: Extensible Proxy Pool Middleware System
Scrapy Request
      ↓
[Residential Proxy Middleware]  ←→  [External Proxy Pool Manager]
         |                                        |
         |  (Acquire/Release Proxy)               |  (Manage IP Health Status,
         |                                        |   Implement Smart Rotation)
         ↓                                        ↓
  [Target Website]                       [Rapidproxy API / Dashboard]
Decoupling proxy acquisition logic from request processing makes proxy management more flexible and intelligent.
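The middleware that follows talks to such an external manager through a small client object. As a reference, here is a minimal sketch of what that hypothetical ProxyPoolClient (imported below from your_project.proxy_pool) could look like; only the interface the middleware actually calls is shown, and the acquisition logic itself would wrap your provider's API (e.g. the Rapidproxy endpoint from the diagram).
# proxy_pool.py — minimal sketch of the hypothetical ProxyPoolClient the middleware expects
import threading


class ProxyPoolClient:
    def __init__(self, api_endpoint, api_key, default_location='us',
                 max_ip_per_domain=3, health_check_interval=60):
        # Constructor arguments mirror the RESIDENTIAL_PROXY_POOL settings dict
        self.api_endpoint = api_endpoint
        self.api_key = api_key
        self.default_location = default_location
        self.max_ip_per_domain = max_ip_per_domain
        self.health_check_interval = health_check_interval
        self._lock = threading.Lock()
        self._failures = {}  # proxy id -> consecutive failure count

    def acquire_proxy(self, strategy=None):
        """Return a proxy dict like {'id': ..., 'endpoint': 'http://host:port', 'auth': ...}
        chosen according to the given strategy, or None if the pool is exhausted."""
        # Placeholder: a real client would call the provider's API (self.api_endpoint),
        # honouring strategy['location'], strategy['strategy'] and session TTLs.
        return None

    def report_success(self, proxy_id):
        """Reset the failure counter so healthy IPs stay in rotation."""
        with self._lock:
            self._failures[proxy_id] = 0

    def report_failure(self, proxy_id, ban_threshold=3):
        """Count consecutive failures; retire an IP once it looks banned."""
        with self._lock:
            self._failures[proxy_id] = self._failures.get(proxy_id, 0) + 1
            if self._failures[proxy_id] >= ban_threshold:
                self._retire(proxy_id)

    def _retire(self, proxy_id):
        # Placeholder: tell the pool manager to stop handing out this IP.
        pass

    def cleanup(self):
        """Release any held sessions when the spider closes."""
        pass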
Middleware Implementation
# middlewares.py
import random
import logging
from scrapy import signals
from scrapy.exceptions import IgnoreRequest, NotConfigured
from your_project.proxy_pool import ProxyPoolClient # Hypothetical proxy pool client
class AdvancedResidentialProxyMiddleware:
def __init__(self, proxy_pool_client):
self.proxy_pool = proxy_pool_client
self.logger = logging.getLogger(__name__)
self.stats = {}
@classmethod
def from_crawler(cls, crawler):
pool_config = crawler.settings.getdict('RESIDENTIAL_PROXY_POOL')
if not pool_config:
raise NotConfigured('RESIDENTIAL_PROXY_POOL not configured')
proxy_pool = ProxyPoolClient(**pool_config)
middleware = cls(proxy_pool)
crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
return middleware
def process_request(self, request, spider):
if 'proxy' in request.meta:
return
target_domain = request.url.split('/')[2]
proxy_strategy = self._select_strategy(target_domain, spider)
proxy = self.proxy_pool.acquire_proxy(strategy=proxy_strategy)
        if not proxy:
            self.logger.error(f"No available proxy for {target_domain}")
            # Drop the request cleanly instead of raising a bare Exception
            raise IgnoreRequest(f"Proxy pool exhausted for {target_domain}")
request.meta['proxy'] = proxy['endpoint']
request.meta['proxy_meta'] = proxy
if proxy.get('auth'):
request.headers['Proxy-Authorization'] = proxy['auth']
proxy_key = proxy['id']
self.stats[proxy_key] = self.stats.get(proxy_key, 0) + 1
    def process_response(self, request, response, spider):
        proxy_meta = request.meta.get('proxy_meta')
        if proxy_meta:
            # Treat ban/throttle codes as proxy failures so the pool can rotate the IP;
            # anything else counts toward the IP's health score.
            if response.status in (403, 407, 429) or response.status >= 500:
                self.proxy_pool.report_failure(proxy_meta['id'])
            else:
                self.proxy_pool.report_success(proxy_meta['id'])
        return response
def process_exception(self, request, exception, spider):
proxy_meta = request.meta.get('proxy_meta')
if proxy_meta:
self.proxy_pool.report_failure(proxy_meta['id'])
return None
def spider_closed(self, spider, reason):
self.logger.info(f"Proxy usage statistics: {self.stats}")
self.proxy_pool.cleanup()
    def _select_strategy(self, domain, spider):
        """Select a proxy strategy based on the target domain."""
        domain_strategies = spider.settings.get('DOMAIN_PROXY_STRATEGIES', {})
        # Match subdomains as well, so 'www.amazon.com' picks up the 'amazon.com' entry
        for configured_domain, strategy in domain_strategies.items():
            if domain == configured_domain or domain.endswith('.' + configured_domain):
                return strategy
        return {
            'strategy': 'random',
            'location': 'global',
            'session_ttl': random.randint(30, 300)
        }
Scrapy Settings
# settings.py
DOWNLOADER_MIDDLEWARES = {
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
'your_project.middlewares.AdvancedResidentialProxyMiddleware': 100,
'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
}
CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 2
DOWNLOAD_DELAY = 0.5
RANDOMIZE_DOWNLOAD_DELAY = True
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1.0
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0
RETRY_ENABLED = True
RETRY_TIMES = 3
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429, 403]
RESIDENTIAL_PROXY_POOL = {
'api_endpoint': 'https://api.rapidproxy.io/v1/pool',
'api_key': 'your_api_key_here', # Prefer reading from env var
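    # Assumption: in practice read the key from the environment instead of committing it,
    # e.g. 'api_key': os.environ.get('RAPIDPROXY_API_KEY', ''),
    # (needs `import os` at the top; the variable name is just a hypothetical example)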
'default_location': 'us',
'max_ip_per_domain': 3,
'health_check_interval': 60,
}
DOMAIN_PROXY_STRATEGIES = {
'amazon.com': {'location': 'us', 'strategy': 'sticky', 'session_ttl': 600},
'taobao.com': {'location': 'cn', 'strategy': 'rotate', 'rotate_interval': 30},
'example.co.uk': {'location': 'gb', 'strategy': 'random'},
}
DOWNLOAD_TIMEOUT = 30
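With the middleware and settings in place, spiders stay proxy-agnostic. The snippet below is a minimal usage sketch (the spider name, URLs, and the internal proxy address are placeholders): ordinary requests are routed through the pool according to DOMAIN_PROXY_STRATEGIES, while a request that sets meta['proxy'] itself bypasses the middleware, which returns early when the key is already present.
# spiders/products.py — minimal usage sketch (names, URLs, and addresses are placeholders)
import scrapy


class ProductSpider(scrapy.Spider):
    name = "products"

    def start_requests(self):
        # Handled by AdvancedResidentialProxyMiddleware using the
        # DOMAIN_PROXY_STRATEGIES entry for amazon.com (sticky US session).
        yield scrapy.Request("https://www.amazon.com/dp/EXAMPLE", callback=self.parse)

        # An explicit meta['proxy'] bypasses the pool entirely:
        # the middleware skips requests that already carry the key.
        yield scrapy.Request(
            "https://example.co.uk/health",
            meta={"proxy": "http://static-proxy.internal:8080"},
            callback=self.parse,
        )

    def parse(self, response):
        self.logger.info("Fetched %s via %s", response.url,
                         response.request.meta.get("proxy"))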
Adaptive Concurrency Extension
# extensions.py
from scrapy import signals
from scrapy.exceptions import NotConfigured
class AdaptiveConcurrencyExtension:
def __init__(self, crawler):
self.crawler = crawler
self.success_rate = 1.0
self.min_concurrency = 1
self.max_concurrency = crawler.settings.getint('CONCURRENT_REQUESTS')
crawler.signals.connect(self.response_received, signal=signals.response_received)
crawler.signals.connect(self.request_dropped, signal=signals.request_dropped)
@classmethod
def from_crawler(cls, crawler):
ext = cls(crawler)
crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
return ext
def response_received(self, response, request, spider):
if response.status >= 400:
self.success_rate *= 0.95
else:
self.success_rate = min(1.0, self.success_rate * 1.01)
self._adjust_concurrency()
def _adjust_concurrency(self):
"""Adjust concurrency based on success rate"""
if self.success_rate > 0.95:
            # downloader.active is a set of in-flight requests, so use its length
            new_concurrency = min(
                self.max_concurrency,
                int(len(self.crawler.engine.downloader.active) * 1.1)
            )
# Apply new concurrency (implementation depends on Scrapy version)