Advanced Configuration and Performance Tuning for Residential Proxies in the Scrapy Framework

Published: December 3, 2025, 7:48 PM GMT+9
4 min read

Source: Dev.to

When you run large-scale, high-frequency data scraping with Scrapy, a bare-bones proxy setup quickly hits its limits. Random IP rotation and fixed delays are inefficient, waste IPs, and can trip sophisticated anti-bot mechanisms. Integrating residential proxies deeply into Scrapy and tuning performance alongside that integration is essential for building a robust, efficient, production-grade data pipeline.

Recommended Architecture: A Scalable Proxy Pool Middleware System

[Scrapy Request]
        ↓
[Residential Proxy Middleware] ←→ [External Proxy Pool Manager]
        |                                 |
        | (Acquire/Release Proxy)         | (Manage IP Health Status,
        |                                 |  Implement Smart Rotation)
        ↓                                 ↓
[Target Website]                  [Rapidproxy API / Dashboard]

Separating proxy acquisition logic from request handling makes proxy management far more flexible and intelligent.
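
The middleware below imports a ProxyPoolClient from your_project.proxy_pool, which the article treats as hypothetical. As a rough sketch of the interface such a client would need to expose (the HTTP endpoints, field names, and defaults here are assumptions, not a real provider API), something like the following is enough to back the middleware:

# proxy_pool.py  (hypothetical client; endpoints and field names are assumptions)
import requests

class ProxyPoolClient:
    def __init__(self, api_endpoint, api_key, default_location='us',
                 max_ip_per_domain=3, health_check_interval=60):
        self.api_endpoint = api_endpoint
        self.default_location = default_location
        self.max_ip_per_domain = max_ip_per_domain
        self.health_check_interval = health_check_interval
        self.session = requests.Session()
        self.session.headers['Authorization'] = f'Bearer {api_key}'

    def acquire_proxy(self, strategy=None):
        """Ask the pool manager for one healthy proxy matching the strategy.

        Expected to return a dict such as
        {'id': '...', 'endpoint': 'http://ip:port', 'auth': None},
        or None when the pool is exhausted.
        """
        params = dict(strategy or {})
        params.setdefault('location', self.default_location)
        resp = self.session.get(f'{self.api_endpoint}/acquire', params=params, timeout=10)
        return resp.json() if resp.status_code == 200 else None

    def report_success(self, proxy_id):
        # Health feedback keeps well-behaved IPs in rotation.
        self.session.post(f'{self.api_endpoint}/report',
                          json={'id': proxy_id, 'ok': True}, timeout=10)

    def report_failure(self, proxy_id):
        # Failing IPs can be cooled down or evicted by the pool manager.
        self.session.post(f'{self.api_endpoint}/report',
                          json={'id': proxy_id, 'ok': False}, timeout=10)

    def cleanup(self):
        self.session.close()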

Middleware Implementation

# middlewares.py
import logging
import random
from urllib.parse import urlparse

from scrapy import signals
from scrapy.exceptions import IgnoreRequest, NotConfigured

from your_project.proxy_pool import ProxyPoolClient  # Hypothetical proxy pool client

class AdvancedResidentialProxyMiddleware:
    def __init__(self, proxy_pool_client):
        self.proxy_pool = proxy_pool_client
        self.logger = logging.getLogger(__name__)
        self.stats = {}

    @classmethod
    def from_crawler(cls, crawler):
        pool_config = crawler.settings.getdict('RESIDENTIAL_PROXY_POOL')
        if not pool_config:
            raise NotConfigured('RESIDENTIAL_PROXY_POOL not configured')

        proxy_pool = ProxyPoolClient(**pool_config)
        middleware = cls(proxy_pool)

        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        if 'proxy' in request.meta:
            return

        target_domain = urlparse(request.url).netloc
        proxy_strategy = self._select_strategy(target_domain, spider)

        proxy = self.proxy_pool.acquire_proxy(strategy=proxy_strategy)
        if not proxy:
            self.logger.error(f"No available proxy for {target_domain}")
            raise IgnoreRequest("Proxy pool exhausted")

        request.meta['proxy'] = proxy['endpoint']
        request.meta['proxy_meta'] = proxy

        if proxy.get('auth'):
            request.headers['Proxy-Authorization'] = proxy['auth']

        proxy_key = proxy['id']
        self.stats[proxy_key] = self.stats.get(proxy_key, 0) + 1

    def process_response(self, request, response, spider):
        proxy_meta = request.meta.get('proxy_meta')
        if proxy_meta:
            self.proxy_pool.report_success(proxy_meta['id'])
        return response

    def process_exception(self, request, exception, spider):
        proxy_meta = request.meta.get('proxy_meta')
        if proxy_meta:
            self.proxy_pool.report_failure(proxy_meta['id'])
        return None

    def spider_closed(self, spider, reason):
        self.logger.info(f"Proxy usage statistics: {self.stats}")
        self.proxy_pool.cleanup()

    def _select_strategy(self, domain, spider):
        """Select proxy strategy based on target domain"""
        domain_strategies = spider.settings.get('DOMAIN_PROXY_STRATEGIES', {})

        if domain in domain_strategies:
            return domain_strategies[domain]

        return {
            'strategy': 'random',
            'location': 'global',
            'session_ttl': random.randint(30, 300)
        }
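
Because process_request skips any request that already carries a 'proxy' key in its meta, individual requests can still pin a specific endpoint and bypass the pool. A minimal spider showing both paths (the spider name, URLs, and the pinned proxy address are placeholders for illustration):

# spiders/example_spider.py  (illustrative only; names and URLs are placeholders)
import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'

    def start_requests(self):
        # Default path: no 'proxy' in meta, so the middleware assigns one from the pool.
        yield scrapy.Request('https://example.com/products', callback=self.parse)

        # Override path: this request keeps the proxy it specifies and bypasses the pool.
        yield scrapy.Request(
            'https://example.co.uk/products',
            callback=self.parse,
            meta={'proxy': 'http://user:pass@gb.residential.example:8000'},
        )

    def parse(self, response):
        proxy_meta = response.meta.get('proxy_meta')
        self.logger.info("Fetched %s via %s", response.url,
                         proxy_meta['endpoint'] if proxy_meta else response.meta.get('proxy'))
        yield {'url': response.url, 'status': response.status}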

Scrapy Settings

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    'your_project.middlewares.AdvancedResidentialProxyMiddleware': 100,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
    'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
}

CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 2
DOWNLOAD_DELAY = 0.5
RANDOMIZE_DOWNLOAD_DELAY = True
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1.0
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0

RETRY_ENABLED = True
RETRY_TIMES = 3
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429, 403]

RESIDENTIAL_PROXY_POOL = {
    'api_endpoint': 'https://api.rapidproxy.io/v1/pool',
    'api_key': 'your_api_key_here',  # Prefer reading from env var
    'default_location': 'us',
    'max_ip_per_domain': 3,
    'health_check_interval': 60,
}

DOMAIN_PROXY_STRATEGIES = {
    'amazon.com': {'location': 'us', 'strategy': 'sticky', 'session_ttl': 600},
    'taobao.com': {'location': 'cn', 'strategy': 'rotate', 'rotate_interval': 30},
    'example.co.uk': {'location': 'gb', 'strategy': 'random'},
}

DOWNLOAD_TIMEOUT = 30
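
The RESIDENTIAL_PROXY_POOL block above hard-codes the API key, with only a comment suggesting an environment variable. One way to follow that suggestion (assuming an environment variable named RAPIDPROXY_API_KEY, an arbitrary name, not one the provider defines):

# settings.py (excerpt) -- read the key from the environment instead of committing it
import os

RESIDENTIAL_PROXY_POOL = {
    'api_endpoint': 'https://api.rapidproxy.io/v1/pool',
    'api_key': os.environ.get('RAPIDPROXY_API_KEY', ''),  # hypothetical variable name
    'default_location': 'us',
    'max_ip_per_domain': 3,
    'health_check_interval': 60,
}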

Adaptive Concurrency Extension

# extensions.py
from scrapy import signals
from scrapy.exceptions import NotConfigured

class AdaptiveConcurrencyExtension:
    def __init__(self, crawler):
        self.crawler = crawler
        self.success_rate = 1.0
        self.min_concurrency = 1
        self.max_concurrency = crawler.settings.getint('CONCURRENT_REQUESTS')

        crawler.signals.connect(self.response_received, signal=signals.response_received)
        crawler.signals.connect(self.request_dropped, signal=signals.request_dropped)

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls(crawler)
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        return ext

    def spider_opened(self, spider):
        spider.logger.info("Adaptive concurrency enabled (max=%d)", self.max_concurrency)

    def request_dropped(self, request, spider):
        # A dropped request also counts against the success rate.
        self.success_rate *= 0.95

    def response_received(self, response, request, spider):
        if response.status >= 400:
            self.success_rate *= 0.95
        else:
            self.success_rate = min(1.0, self.success_rate * 1.01)

        self._adjust_concurrency()

    def _adjust_concurrency(self):
        """Grow or shrink concurrency based on the rolling success rate."""
        downloader = self.crawler.engine.downloader
        active = len(downloader.active)

        if self.success_rate > 0.95:
            new_concurrency = min(self.max_concurrency, int(active * 1.1) + 1)
        elif self.success_rate < 0.8:
            new_concurrency = max(self.min_concurrency, active // 2)
        else:
            return

        # How the new value is applied depends on the Scrapy version; current
        # versions expose Downloader.total_concurrency, which caps in-flight requests.
        downloader.total_concurrency = new_concurrency
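
For the extension to take effect it still has to be enabled in the project settings; assuming it lives in your_project/extensions.py as above:

# settings.py (excerpt) -- enable the adaptive concurrency extension
EXTENSIONS = {
    'your_project.extensions.AdaptiveConcurrencyExtension': 500,
}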