Advanced Configuration and Performance Tuning of Residential Proxies in the Scrapy Framework
Published: December 3, 2025, 18:48 (GMT+8)
3 min read
Source: Dev.to
When using Scrapy for large-scale, high-frequency scraping, simple proxy settings quickly fall short. Random IP rotation and fixed delays lead to inefficiency, wasted IPs, and triggering of sophisticated anti-bot mechanisms. Deeply integrating residential proxies into Scrapy, combined with performance tuning, is the key to building an industrial-grade, robust, and efficient data pipeline.
Recommended Architecture: A Scalable Proxy Pool Middleware System
Scrapy Request
        ↓
[Residential Proxy Middleware] ←→ [External Proxy Pool Manager]
        |                                     |
        | (Acquire/Release Proxy)             | (Manage IP Health Status,
        |                                     |  Implement Smart Rotation)
        ↓                                     ↓
[Target Website]                   [Rapidproxy API / Dashboard]
Decoupling proxy-acquisition logic from request handling makes proxy management more flexible and intelligent.
Middleware Implementation
# middlewares.py
import random
import logging

from scrapy import signals
from scrapy.exceptions import NotConfigured

from your_project.proxy_pool import ProxyPoolClient  # Hypothetical proxy pool client


class AdvancedResidentialProxyMiddleware:
    def __init__(self, proxy_pool_client):
        self.proxy_pool = proxy_pool_client
        self.logger = logging.getLogger(__name__)
        self.stats = {}

    @classmethod
    def from_crawler(cls, crawler):
        pool_config = crawler.settings.getdict('RESIDENTIAL_PROXY_POOL')
        if not pool_config:
            raise NotConfigured('RESIDENTIAL_PROXY_POOL not configured')
        proxy_pool = ProxyPoolClient(**pool_config)
        middleware = cls(proxy_pool)
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        # Respect a proxy that was already set explicitly on the request
        if 'proxy' in request.meta:
            return
        target_domain = request.url.split('/')[2]
        proxy_strategy = self._select_strategy(target_domain, spider)
        proxy = self.proxy_pool.acquire_proxy(strategy=proxy_strategy)
        if not proxy:
            self.logger.error(f"No available proxy for {target_domain}")
            raise Exception("ProxyPoolExhausted")
        request.meta['proxy'] = proxy['endpoint']
        request.meta['proxy_meta'] = proxy
        if proxy.get('auth'):
            request.headers['Proxy-Authorization'] = proxy['auth']
        # Track how many requests each proxy has served
        proxy_key = proxy['id']
        self.stats[proxy_key] = self.stats.get(proxy_key, 0) + 1

    def process_response(self, request, response, spider):
        proxy_meta = request.meta.get('proxy_meta')
        if proxy_meta:
            self.proxy_pool.report_success(proxy_meta['id'])
        return response

    def process_exception(self, request, exception, spider):
        proxy_meta = request.meta.get('proxy_meta')
        if proxy_meta:
            self.proxy_pool.report_failure(proxy_meta['id'])
        return None

    def spider_closed(self, spider, reason):
        self.logger.info(f"Proxy usage statistics: {self.stats}")
        self.proxy_pool.cleanup()

    def _select_strategy(self, domain, spider):
        """Select proxy strategy based on target domain"""
        domain_strategies = spider.settings.get('DOMAIN_PROXY_STRATEGIES', {})
        if domain in domain_strategies:
            return domain_strategies[domain]
        # Fall back to a random global proxy with a short-lived session
        return {
            'strategy': 'random',
            'location': 'global',
            'session_ttl': random.randint(30, 300)
        }
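The middleware above relies on a ProxyPoolClient imported from your_project.proxy_pool, which is left hypothetical here. A minimal sketch of the interface the middleware assumes (acquire_proxy, report_success, report_failure, cleanup), backed by a REST-style pool manager such as Rapidproxy, could look like the following; the endpoint paths and response fields are assumptions for illustration, not a documented API:

# proxy_pool.py (minimal sketch; endpoint paths and fields are assumptions)
import requests


class ProxyPoolClient:
    def __init__(self, api_endpoint, api_key, default_location='us',
                 max_ip_per_domain=3, health_check_interval=60):
        self.api_endpoint = api_endpoint
        self.default_location = default_location
        self.session = requests.Session()
        self.session.headers['Authorization'] = f'Bearer {api_key}'

    def acquire_proxy(self, strategy=None):
        # Ask the pool manager for a proxy matching the strategy; assumed to
        # return a dict with 'id', 'endpoint', and an optional 'auth' field
        resp = self.session.post(f'{self.api_endpoint}/acquire', json=strategy or {})
        return resp.json() if resp.ok else None

    def report_success(self, proxy_id):
        self.session.post(f'{self.api_endpoint}/report', json={'id': proxy_id, 'ok': True})

    def report_failure(self, proxy_id):
        # Failures feed the pool's IP health scoring so bad IPs rotate out
        self.session.post(f'{self.api_endpoint}/report', json={'id': proxy_id, 'ok': False})

    def cleanup(self):
        self.session.close()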
Scrapy Settings
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
    'your_project.middlewares.AdvancedResidentialProxyMiddleware': 100,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
}

CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 2
DOWNLOAD_DELAY = 0.5
RANDOMIZE_DOWNLOAD_DELAY = True

AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1.0
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0

RETRY_ENABLED = True
RETRY_TIMES = 3
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429, 403]

RESIDENTIAL_PROXY_POOL = {
    'api_endpoint': 'https://api.rapidproxy.io/v1/pool',
    'api_key': 'your_api_key_here',  # Prefer reading from env var
    'default_location': 'us',
    'max_ip_per_domain': 3,
    'health_check_interval': 60,
}

DOMAIN_PROXY_STRATEGIES = {
    'amazon.com': {'location': 'us', 'strategy': 'sticky', 'session_ttl': 600},
    'taobao.com': {'location': 'cn', 'strategy': 'rotate', 'rotate_interval': 30},
    'example.co.uk': {'location': 'gb', 'strategy': 'random'},
}

DOWNLOAD_TIMEOUT = 30
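As the inline comment suggests, the API key is better read from an environment variable than hard-coded in settings.py. One way to do that is shown below; the variable name RAPIDPROXY_API_KEY is just an example:

# settings.py (variant): read the key from the environment instead of hard-coding it
import os

RESIDENTIAL_PROXY_POOL = {
    'api_endpoint': 'https://api.rapidproxy.io/v1/pool',
    'api_key': os.environ.get('RAPIDPROXY_API_KEY', ''),  # e.g. export RAPIDPROXY_API_KEY=...
    'default_location': 'us',
    'max_ip_per_domain': 3,
    'health_check_interval': 60,
}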
Adaptive Concurrency Extension
# extensions.py
from scrapy import signals
from scrapy.exceptions import NotConfigured


class AdaptiveConcurrencyExtension:
    def __init__(self, crawler):
        self.crawler = crawler
        self.success_rate = 1.0
        self.min_concurrency = 1
        self.max_concurrency = crawler.settings.getint('CONCURRENT_REQUESTS')
        crawler.signals.connect(self.response_received, signal=signals.response_received)
        crawler.signals.connect(self.request_dropped, signal=signals.request_dropped)

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls(crawler)
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        return ext

    def response_received(self, response, request, spider):
        # Decay the success rate on error responses, recover slowly on successes
        if response.status >= 400:
            self.success_rate *= 0.95
        else:
            self.success_rate = min(1.0, self.success_rate * 1.01)
        self._adjust_concurrency()

    def _adjust_concurrency(self):
        """Adjust concurrency based on success rate"""
        if self.success_rate > 0.95:
            new_concurrency = min(
                self.max_concurrency,
                int(len(self.crawler.engine.downloader.active) * 1.1)
            )
            # Apply new concurrency (implementation depends on Scrapy version)
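            # One possible way to apply it (an assumption, not from the original
            # post): recent Scrapy versions expose the global request limit as
            # downloader.total_concurrency, which the engine consults when deciding
            # whether to feed more requests. Per-slot tuning (as AutoThrottle does
            # with slot.concurrency and slot.delay) is an alternative approach.
            self.crawler.engine.downloader.total_concurrency = max(
                self.min_concurrency, new_concurrency
            )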