Advanced Configuration and Performance Tuning for Residential Proxies in the Scrapy Framework
Source: Dev.to
When you run large-scale, high-frequency scraping with Scrapy, a bare-bones proxy setting quickly hits its limits. Random IP rotation and fixed delays are inefficient, waste IPs, and can trip sophisticated anti-bot mechanisms. Integrating residential proxies deeply into Scrapy and pairing that integration with performance tuning is essential for building a robust, efficient, industrial-grade data pipeline.
Recommended Architecture: A Scalable Proxy Pool Middleware System
        Scrapy Request
              ↓
[Residential Proxy Middleware] ←→ [External Proxy Pool Manager]
              |                                 |
              | (Acquire/Release Proxy)         | (Manage IP Health Status,
              |                                 |  Implement Smart Rotation)
              ↓                                 ↓
      [Target Website]             [Rapidproxy API / Dashboard]
Decoupling proxy acquisition from request handling makes proxy management far more flexible and intelligent.
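The middleware shown below talks to the pool manager through a thin client. The ProxyPoolClient it imports is hypothetical, and its real API depends entirely on your provider; a minimal sketch of the interface the middleware relies on (acquire a proxy, report success/failure, clean up) might look like this:

# your_project/proxy_pool.py -- hypothetical client; adapt to your provider's API
import requests

class ProxyPoolClient:
    def __init__(self, api_endpoint, api_key, **options):
        self.api_endpoint = api_endpoint
        self.api_key = api_key
        self.options = options  # e.g. default_location, max_ip_per_domain

    def acquire_proxy(self, strategy):
        # Ask the pool manager for an IP matching the strategy (location, sticky/rotate, TTL).
        resp = requests.post(
            f"{self.api_endpoint}/acquire",
            headers={"Authorization": f"Bearer {self.api_key}"},
            json=strategy,
            timeout=10,
        )
        if resp.status_code != 200:
            return None
        # Expected shape: {'id': ..., 'endpoint': 'http://host:port', 'auth': 'Basic ...'}
        return resp.json()

    def report_success(self, proxy_id):
        # Feed health data back so the pool can prefer well-behaved IPs.
        requests.post(f"{self.api_endpoint}/report", json={"id": proxy_id, "ok": True}, timeout=5)

    def report_failure(self, proxy_id):
        requests.post(f"{self.api_endpoint}/report", json={"id": proxy_id, "ok": False}, timeout=5)

    def cleanup(self):
        # Release any sticky sessions still held by this crawl.
        pass

The /acquire and /report endpoints and the response shape above are assumptions made for illustration; only the method names match what the middleware calls.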
Middleware Implementation
# middlewares.py
import logging
import random
from urllib.parse import urlparse

from scrapy import signals
from scrapy.exceptions import IgnoreRequest, NotConfigured

from your_project.proxy_pool import ProxyPoolClient  # Hypothetical proxy pool client


class AdvancedResidentialProxyMiddleware:
    def __init__(self, proxy_pool_client):
        self.proxy_pool = proxy_pool_client
        self.logger = logging.getLogger(__name__)
        self.stats = {}

    @classmethod
    def from_crawler(cls, crawler):
        pool_config = crawler.settings.getdict('RESIDENTIAL_PROXY_POOL')
        if not pool_config:
            raise NotConfigured('RESIDENTIAL_PROXY_POOL not configured')
        proxy_pool = ProxyPoolClient(**pool_config)
        middleware = cls(proxy_pool)
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        # Requests that already carry an explicit proxy are left untouched.
        if 'proxy' in request.meta:
            return
        target_domain = urlparse(request.url).netloc
        proxy_strategy = self._select_strategy(target_domain, spider)
        proxy = self.proxy_pool.acquire_proxy(strategy=proxy_strategy)
        if not proxy:
            self.logger.error(f"No available proxy for {target_domain}")
            raise IgnoreRequest("Proxy pool exhausted")
        request.meta['proxy'] = proxy['endpoint']
        request.meta['proxy_meta'] = proxy
        if proxy.get('auth'):
            request.headers['Proxy-Authorization'] = proxy['auth']
        # Track how often each proxy is used for the end-of-run report.
        proxy_key = proxy['id']
        self.stats[proxy_key] = self.stats.get(proxy_key, 0) + 1

    def process_response(self, request, response, spider):
        proxy_meta = request.meta.get('proxy_meta')
        if proxy_meta:
            self.proxy_pool.report_success(proxy_meta['id'])
        return response

    def process_exception(self, request, exception, spider):
        proxy_meta = request.meta.get('proxy_meta')
        if proxy_meta:
            self.proxy_pool.report_failure(proxy_meta['id'])
        return None

    def spider_closed(self, spider, reason):
        self.logger.info(f"Proxy usage statistics: {self.stats}")
        self.proxy_pool.cleanup()

    def _select_strategy(self, domain, spider):
        """Select proxy strategy based on target domain"""
        domain_strategies = spider.settings.get('DOMAIN_PROXY_STRATEGIES', {})
        if domain in domain_strategies:
            return domain_strategies[domain]
        return {
            'strategy': 'random',
            'location': 'global',
            'session_ttl': random.randint(30, 300)
        }
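On the spider side nothing special is required: any request without an explicit proxy goes through the middleware, while a request that already sets request.meta['proxy'] is passed through untouched (the early return in process_request). A small illustrative spider, with hypothetical URLs and a hypothetical static proxy endpoint:

# spiders/products.py -- illustrative only
import scrapy

class ProductsSpider(scrapy.Spider):
    name = 'products'

    def start_requests(self):
        # Goes through AdvancedResidentialProxyMiddleware automatically.
        yield scrapy.Request('https://example.com/catalog', callback=self.parse)

        # Explicit override: the middleware leaves this request alone.
        yield scrapy.Request(
            'https://example.com/status',
            callback=self.parse,
            meta={'proxy': 'http://user:pass@static.proxy.example:8000'},
        )

    def parse(self, response):
        self.logger.info("Fetched %s via %s", response.url, response.meta.get('proxy'))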
Scrapy Settings
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
    'your_project.middlewares.AdvancedResidentialProxyMiddleware': 100,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
}
CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 2
DOWNLOAD_DELAY = 0.5
RANDOMIZE_DOWNLOAD_DELAY = True
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1.0
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0
RETRY_ENABLED = True
RETRY_TIMES = 3
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429, 403]
RESIDENTIAL_PROXY_POOL = {
    'api_endpoint': 'https://api.rapidproxy.io/v1/pool',
    'api_key': 'your_api_key_here',  # Prefer reading from an environment variable
    'default_location': 'us',
    'max_ip_per_domain': 3,
    'health_check_interval': 60,
}
DOMAIN_PROXY_STRATEGIES = {
    'amazon.com': {'location': 'us', 'strategy': 'sticky', 'session_ttl': 600},
    'taobao.com': {'location': 'cn', 'strategy': 'rotate', 'rotate_interval': 30},
    'example.co.uk': {'location': 'gb', 'strategy': 'random'},
}
DOWNLOAD_TIMEOUT = 30
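As the comment in RESIDENTIAL_PROXY_POOL suggests, the API key is better pulled from the environment than committed to settings.py. A minimal variant, assuming a hypothetical environment variable named RAPIDPROXY_API_KEY:

# settings.py (variant reading the key from the environment)
import os

RESIDENTIAL_PROXY_POOL = {
    'api_endpoint': 'https://api.rapidproxy.io/v1/pool',
    'api_key': os.environ.get('RAPIDPROXY_API_KEY', ''),  # empty string if unset
    'default_location': 'us',
    'max_ip_per_domain': 3,
    'health_check_interval': 60,
}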
Adaptive Concurrency Extension
# extensions.py
from scrapy import signals
from scrapy.exceptions import NotConfigured


class AdaptiveConcurrencyExtension:
    def __init__(self, crawler):
        self.crawler = crawler
        self.success_rate = 1.0
        self.min_concurrency = 1
        self.max_concurrency = crawler.settings.getint('CONCURRENT_REQUESTS')
        crawler.signals.connect(self.response_received, signal=signals.response_received)
        crawler.signals.connect(self.request_dropped, signal=signals.request_dropped)

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls(crawler)
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        return ext

    def spider_opened(self, spider):
        spider.logger.info("AdaptiveConcurrencyExtension enabled (max concurrency: %d)",
                           self.max_concurrency)

    def request_dropped(self, request, spider):
        # Requests rejected by the scheduler never reach the downloader;
        # the hook is kept so drops can be tracked or factored in later.
        pass

    def response_received(self, response, request, spider):
        if response.status >= 400:
            self.success_rate *= 0.95
        else:
            self.success_rate = min(1.0, self.success_rate * 1.01)
        self._adjust_concurrency()

    def _adjust_concurrency(self):
        """Adjust concurrency based on success rate"""
        if self.success_rate > 0.95:
            new_concurrency = min(
                self.max_concurrency,
                int(len(self.crawler.engine.downloader.active) * 1.1)
            )
            # Apply new concurrency (implementation depends on Scrapy version)
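Like any Scrapy extension, this one only runs once it is registered. A sketch of the registration, assuming the class lives in your_project/extensions.py:

# settings.py
EXTENSIONS = {
    'your_project.extensions.AdaptiveConcurrencyExtension': 500,
}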