Advanced Residential Proxy Configuration and Performance Tuning in Scrapy

Published: December 3, 2025, 18:48 (GMT+8)
3 min read
Source: Dev.to

When running large-scale, high-frequency crawls with Scrapy, a simple proxy setup quickly falls short. Random IP rotation and fixed delays lead to poor efficiency, wasted IPs, and trip sophisticated anti-bot mechanisms. Deeply integrating residential proxies into Scrapy, combined with performance tuning, is the key to building an industrial-grade, robust, and efficient data pipeline.

Recommended Architecture: A Scalable Proxy-Pool Middleware System

[Scrapy Request]
        ↓
[Residential Proxy Middleware] ←→ [External Proxy Pool Manager]
        |                                 |
        | (Acquire/Release Proxy)         | (Manage IP Health Status,
        |                                 |  Implement Smart Rotation)
        ↓                                 ↓
[Target Website]                  [Rapidproxy API / Dashboard]

Decoupling proxy acquisition from request processing makes proxy management more flexible and intelligent.

Middleware Implementation

# middlewares.py
import random
import logging
from urllib.parse import urlparse

from scrapy import signals
from scrapy.exceptions import IgnoreRequest, NotConfigured
from your_project.proxy_pool import ProxyPoolClient  # Hypothetical proxy pool client

class AdvancedResidentialProxyMiddleware:
    def __init__(self, proxy_pool_client):
        self.proxy_pool = proxy_pool_client
        self.logger = logging.getLogger(__name__)
        self.stats = {}

    @classmethod
    def from_crawler(cls, crawler):
        pool_config = crawler.settings.getdict('RESIDENTIAL_PROXY_POOL')
        if not pool_config:
            raise NotConfigured('RESIDENTIAL_PROXY_POOL not configured')

        proxy_pool = ProxyPoolClient(**pool_config)
        middleware = cls(proxy_pool)

        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        if 'proxy' in request.meta:
            return

        # Extract the host (netloc) instead of slicing the URL string
        target_domain = urlparse(request.url).netloc
        proxy_strategy = self._select_strategy(target_domain, spider)

        proxy = self.proxy_pool.acquire_proxy(strategy=proxy_strategy)
        if not proxy:
            self.logger.error(f"No available proxy for {target_domain}")
            raise IgnoreRequest("Proxy pool exhausted")

        request.meta['proxy'] = proxy['endpoint']
        request.meta['proxy_meta'] = proxy

        if proxy.get('auth'):
            request.headers['Proxy-Authorization'] = proxy['auth']

        proxy_key = proxy['id']
        self.stats[proxy_key] = self.stats.get(proxy_key, 0) + 1

    def process_response(self, request, response, spider):
        proxy_meta = request.meta.get('proxy_meta')
        if proxy_meta:
            self.proxy_pool.report_success(proxy_meta['id'])
        return response

    def process_exception(self, request, exception, spider):
        proxy_meta = request.meta.get('proxy_meta')
        if proxy_meta:
            self.proxy_pool.report_failure(proxy_meta['id'])
        return None

    def spider_closed(self, spider, reason):
        self.logger.info(f"Proxy usage statistics: {self.stats}")
        self.proxy_pool.cleanup()

    def _select_strategy(self, domain, spider):
        """Select a proxy strategy based on the target domain."""
        domain_strategies = spider.settings.get('DOMAIN_PROXY_STRATEGIES', {})

        # Match configured domains and their subdomains (e.g. www.amazon.com -> amazon.com)
        for configured_domain, strategy in domain_strategies.items():
            if domain == configured_domain or domain.endswith('.' + configured_domain):
                return strategy

        return {
            'strategy': 'random',
            'location': 'global',
            'session_ttl': random.randint(30, 300)
        }
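
The ProxyPoolClient imported above is hypothetical: the middleware only relies on a small acquire/report/cleanup interface. Below is a minimal sketch of such a client, assuming a simple HTTP API with /acquire and /report endpoints; the endpoint paths, field names, and response shape are assumptions, not a real Rapidproxy SDK.

# proxy_pool.py — hypothetical client; endpoints and field names are assumptions
import requests

class ProxyPoolClient:
    def __init__(self, api_endpoint, api_key, default_location='us',
                 max_ip_per_domain=3, health_check_interval=60):
        self.api_endpoint = api_endpoint.rstrip('/')
        self.session = requests.Session()
        self.session.headers['Authorization'] = f'Bearer {api_key}'
        self.default_location = default_location
        self.max_ip_per_domain = max_ip_per_domain
        self.health_check_interval = health_check_interval

    def acquire_proxy(self, strategy=None):
        """Ask the pool for one proxy; returns a dict or None if none is available."""
        resp = self.session.post(f'{self.api_endpoint}/acquire', json=strategy or {}, timeout=10)
        if resp.status_code != 200:
            return None
        # Expected shape: {'id': ..., 'endpoint': 'http://host:port', 'auth': ...}
        return resp.json() or None

    def report_success(self, proxy_id):
        self.session.post(f'{self.api_endpoint}/report', json={'id': proxy_id, 'ok': True}, timeout=10)

    def report_failure(self, proxy_id):
        self.session.post(f'{self.api_endpoint}/report', json={'id': proxy_id, 'ok': False}, timeout=10)

    def cleanup(self):
        self.session.close()

Note that process_request runs inside Scrapy's event loop, so a blocking HTTP call per request will hurt throughput; in practice the client should pre-fetch and cache a batch of proxies.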

Scrapy Settings

# settings.py
DOWNLOADER_MIDDLEWARES = {
    # Lower numbers run earlier in process_request: the custom middleware (100)
    # sets request.meta['proxy'] before the built-in HttpProxyMiddleware (110) runs
    'your_project.middlewares.AdvancedResidentialProxyMiddleware': 100,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
}

CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 2
DOWNLOAD_DELAY = 0.5
RANDOMIZE_DOWNLOAD_DELAY = True
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1.0
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0

RETRY_ENABLED = True
RETRY_TIMES = 3
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429, 403]

RESIDENTIAL_PROXY_POOL = {
    'api_endpoint': 'https://api.rapidproxy.io/v1/pool',
    'api_key': 'your_api_key_here',  # Prefer reading from env var
    'default_location': 'us',
    'max_ip_per_domain': 3,
    'health_check_interval': 60,
}

DOMAIN_PROXY_STRATEGIES = {
    'amazon.com': {'location': 'us', 'strategy': 'sticky', 'session_ttl': 600},
    'taobao.com': {'location': 'cn', 'strategy': 'rotate', 'rotate_interval': 30},
    'example.co.uk': {'location': 'gb', 'strategy': 'random'},
}

DOWNLOAD_TIMEOUT = 30
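
Because the middleware reads DOMAIN_PROXY_STRATEGIES from spider.settings, the mapping can also be overridden per spider through custom_settings. A minimal sketch follows; the spider name, URL, and selectors are illustrative only.

# spiders/products.py — illustrative spider
import scrapy

class ProductsSpider(scrapy.Spider):
    name = 'products'
    start_urls = ['https://www.amazon.com/s?k=laptops']

    custom_settings = {
        # Overrides the project-level mapping for this spider only
        'DOMAIN_PROXY_STRATEGIES': {
            'amazon.com': {'location': 'us', 'strategy': 'sticky', 'session_ttl': 900},
        },
    }

    def parse(self, response):
        for title in response.css('h2 a::text').getall():
            yield {'title': title.strip()}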

Adaptive Concurrency Extension

# extensions.py
from scrapy import signals
from scrapy.exceptions import NotConfigured

class AdaptiveConcurrencyExtension:
    def __init__(self, crawler):
        self.crawler = crawler
        self.success_rate = 1.0
        self.min_concurrency = 1
        self.max_concurrency = crawler.settings.getint('CONCURRENT_REQUESTS')

        crawler.signals.connect(self.response_received, signal=signals.response_received)
        crawler.signals.connect(self.request_dropped, signal=signals.request_dropped)

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls(crawler)
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        return ext

    def spider_opened(self, spider):
        spider.logger.info("AdaptiveConcurrencyExtension enabled (max concurrency: %d)",
                           self.max_concurrency)

    def request_dropped(self, request, spider):
        # Dropped requests are only logged; they do not affect the success rate
        spider.logger.debug("Request dropped: %s", request.url)

    def response_received(self, response, request, spider):
        if response.status >= 400:
            self.success_rate *= 0.95
        else:
            self.success_rate = min(1.0, self.success_rate * 1.01)

        self._adjust_concurrency()

    def _adjust_concurrency(self):
        """Adjust global download concurrency based on the observed success rate."""
        downloader = self.crawler.engine.downloader
        active = len(downloader.active)

        if self.success_rate > 0.95:
            new_concurrency = min(self.max_concurrency, max(int(active * 1.1), active + 1))
        elif self.success_rate < 0.85:
            new_concurrency = max(self.min_concurrency, int(active * 0.9))
        else:
            return

        # Apply the new concurrency via the downloader's internal total_concurrency
        # attribute (implementation detail; may differ across Scrapy versions)
        downloader.total_concurrency = new_concurrency
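
For the extension to run at all, it has to be registered in settings.py; for extensions the numeric order value is not significant.

# settings.py
EXTENSIONS = {
    'your_project.extensions.AdaptiveConcurrencyExtension': 500,
}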