Python source code examples: scrapy.exceptions.IgnoreRequest()
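scrapy.exceptions.IgnoreRequest can be raised by the scheduler or by any downloader middleware to signal that a request should be dropped silently; Scrapy's engine catches it and does not treat it as a download error. The examples below, collected from Scrapy itself and from third-party projects, show both sides: code that raises the exception to drop a request, and code that catches or silences it. As a minimal, hedged sketch before the collected examples (the middleware class, the BLOCKED_DOMAINS setting, and the spider are hypothetical names invented for illustration, not taken from the examples below):

import scrapy
from scrapy.exceptions import IgnoreRequest


class BlockedDomainMiddleware:
    """Hypothetical downloader middleware: drop requests to blocked domains."""

    def __init__(self, blocked_domains):
        self.blocked_domains = blocked_domains

    @classmethod
    def from_crawler(cls, crawler):
        # BLOCKED_DOMAINS is an assumed custom setting, not a Scrapy built-in.
        return cls(crawler.settings.getlist('BLOCKED_DOMAINS'))

    def process_request(self, request, spider):
        # Returning None lets the request continue through the middleware
        # chain; raising IgnoreRequest drops it and calls the request errback.
        if any(domain in request.url for domain in self.blocked_domains):
            raise IgnoreRequest('Blocked domain: %s' % request.url)
        return None


class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['https://example.com/']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, errback=self.on_error)

    def parse(self, response):
        yield {'url': response.url}

    def on_error(self, failure):
        # IgnoreRequest reaches the errback wrapped in a twisted Failure.
        if failure.check(IgnoreRequest):
            self.logger.info('Request ignored: %s', failure.value)

To try a sketch like this you would register the middleware under the DOWNLOADER_MIDDLEWARES setting of the project; the examples that follow show how the same exception is used in real middlewares, pipelines, and tests.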
Example 1
def _redirect(self, redirected, request, spider, reason):
    ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
    redirects = request.meta.get('redirect_times', 0) + 1

    if ttl and redirects <= self.max_redirect_times:
        redirected.meta['redirect_times'] = redirects
        redirected.meta['redirect_ttl'] = ttl - 1
        redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
            [request.url]
        redirected.meta['redirect_reasons'] = request.meta.get('redirect_reasons', []) + \
            [reason]
        redirected.dont_filter = request.dont_filter
        redirected.priority = request.priority + self.priority_adjust
        logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                     {'reason': reason, 'redirected': redirected, 'request': request},
                     extra={'spider': spider})
        return redirected
    else:
        logger.debug("Discarding %(request)s: max redirections reached",
                     {'request': request}, extra={'spider': spider})
        raise IgnoreRequest("max redirections reached")
Example 2
def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
    if isinstance(request_or_url, Request):
        request = request_or_url
    else:
        url = any_to_uri(request_or_url)
        request = Request(url, dont_filter=True, **kwargs)
        if redirect:
            request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
        else:
            request.meta['handle_httpstatus_all'] = True
    response = None
    try:
        response, spider = threads.blockingCallFromThread(
            reactor, self._schedule, request, spider)
    except IgnoreRequest:
        pass
    self.populate_vars(response, request, spider)
Example 3
def _log_download_errors(self, spider_failure, download_failure, request, spider):
    """Log and silence errors that come from the engine (typically download
    errors that got propagated thru here)
    """
    if (isinstance(download_failure, Failure) and
            not download_failure.check(IgnoreRequest)):
        if download_failure.frames:
            logger.error('Error downloading %(request)s',
                         {'request': request},
                         exc_info=failure_to_exc_info(download_failure),
                         extra={'spider': spider})
        else:
            errmsg = download_failure.getErrorMessage()
            if errmsg:
                logger.error('Error downloading %(request)s: %(errmsg)s',
                             {'request': request, 'errmsg': errmsg},
                             extra={'spider': spider})

    if spider_failure is not download_failure:
        return spider_failure
Example 4
def process_exception(self, request, exception, spider):
    if isinstance(exception, (IgnoreRequest, DropItem)):
        return
    if not self._is_enabled_for_request(request):
        return

    autoextract = request.meta.pop(AUTOEXTRACT_META_KEY)
    stop_time = time.time()
    latency = time.time() - autoextract['timing']['start_ts']
    autoextract['timing'].update({'end_ts': stop_time, 'latency': latency})

    # Make sure to log all unknown failures
    logger.warning('AutoExtract failure after %.3fs for %s: %s',
                   latency,
                   autoextract['original_url'],
                   repr(exception),
                   extra={'spider': spider})

    request.meta['autoextract'] = autoextract
    ex_class = global_object_name(exception.__class__)
    self.inc_metric('autoextract/errors/total_count', spider=spider)
    self.inc_metric('autoextract/errors/type_count/%s' % ex_class, spider=spider)
Example 5
def process_request(self, request, spider):
    # don't use this middleware while testing if the site is up
    if hasattr(spider, "test") and spider.test == "yes":
        # logger = logging.getLogger()
        # logger.info("Testing mode, dead domains disabled")
        return None
    if not Domain.is_onion_url(request.url):
        return None
    domain = Domain.find_by_url(request.url)
    if not domain or domain.is_up:
        return None
    raise IgnoreRequest('Domain %s is dead, skipping' % domain.host)
Example 6
def process_request(self, request, spider):
    parsed_url = urlparse.urlparse(request.url)
    if not self.test_mode or parsed_url.path not in ["/", ""]:
        return None
    if not Domain.is_onion_url(request.url):
        return None
    d = Domain.find_by_url(request.url)
    if d is None:
        return None
    now = datetime.now()
    if now > d.next_scheduled_check:
        return None
    else:
        raise IgnoreRequest('FilterNotScheduledMiddleware: %s is not scheduled to check' % d.host)
Example 7
def process_request(self, request, spider):
    if 'x-ignore-request' in request.url:
        raise IgnoreRequest()
    elif 'x-error-request' in request.url:
        _ = 1 / 0
Example 8
def process_response(self, request, response, spider):
    if 'x-ignore-response' in request.url:
        raise IgnoreRequest()
    elif 'x-error-response' in request.url:
        _ = 1 / 0
    else:
        return response
Example 9
def process_request(self, request, spider):
    if not request.url:
        return None
    channel_id = request.meta.get('channel_id', 0)
    # Handle detail pages (ignore list pages), in cooperation with the pipeline
    if is_dup_detail(request.url, spider.name, channel_id):
        raise IgnoreRequest("Spider: %s, DeDuplicationRequest: %s" % (spider.name, request.url))
Example 10
def process_request(self, request, spider):
    # Handle WeChat anti-crawling (anti-spider mechanism 1, sogou)
    if spider.name in ['weixin'] and 'antispider' in request.url:
        # Get the redirect source URLs
        redirect_urls = request.meta['redirect_urls']
        # Clear the invalid cookies
        cookies_id = request.meta['cookiejar']
        del_cookies(spider.name, cookies_id)
        # spider.log(message='AntiSpider cookies_id: %s; url: %s' % (cookies_id, redirect_urls[0]))
        raise IgnoreRequest(
            'Spider: %s, AntiSpider cookies_id: %s; url: %s' % (spider.name, cookies_id, redirect_urls[0]))
Example 11
def mustbe_deferred(f, *args, **kw):
    """Same as twisted.internet.defer.maybeDeferred, but delay calling
    callback/errback to next reactor loop
    """
    try:
        result = f(*args, **kw)
    # FIXME: Hack to avoid introspecting tracebacks. This to speed up
    # processing of IgnoreRequest errors which are, by far, the most common
    # exception in Scrapy - see #125
    except IgnoreRequest as e:
        return defer_fail(failure.Failure(e))
    except Exception:
        return defer_fail(failure.Failure())
    else:
        return defer_result(result)
Example 12
def process_request_2(self, rp, request, spider):
    if rp is None:
        return
    if not rp.can_fetch(to_native_str(self._useragent), request.url):
        logger.debug("Forbidden by robots.txt: %(request)s",
                     {'request': request}, extra={'spider': spider})
        self.crawler.stats.inc_value('robotstxt/forbidden')
        raise IgnoreRequest("Forbidden by robots.txt")
Example 13
def _logerror(self, failure, request, spider):
    if failure.type is not IgnoreRequest:
        logger.error("Error downloading %(request)s: %(f_exception)s",
                     {'request': request, 'f_exception': failure.value},
                     exc_info=failure_to_exc_info(failure),
                     extra={'spider': spider})
    return failure
Example 14
def _robots_error(self, failure, netloc):
    if failure.type is not IgnoreRequest:
        key = 'robotstxt/exception_count/{}'.format(failure.type)
        self.crawler.stats.inc_value(key)
    rp_dfd = self._parsers[netloc]
    self._parsers[netloc] = None
    rp_dfd.callback(None)
Example 15
def media_failed(self, failure, request, info):
    if not isinstance(failure.value, IgnoreRequest):
        referer = referer_str(request)
        logger.warning(
            'File (unknown-error): Error downloading %(medianame)s from '
            '%(request)s referred in <%(referer)s>: %(exception)s',
            {'medianame': self.MEDIA_NAME, 'request': request,
             'referer': referer, 'exception': failure.value},
            extra={'spider': info.spider}
        )

    raise FileException
Example 16
def process_request(self, request, spider):
    if request.meta.get('dont_cache', False):
        return

    # Skip uncacheable requests
    if not self.policy.should_cache_request(request):
        request.meta['_dont_cache'] = True  # flag as uncacheable
        return

    # Look for cached response and check if expired
    cachedresponse = self.storage.retrieve_response(spider, request)
    if cachedresponse is None:
        self.stats.inc_value('httpcache/miss', spider=spider)
        if self.ignore_missing:
            self.stats.inc_value('httpcache/ignore', spider=spider)
            raise IgnoreRequest("Ignored request not in cache: %s" % request)
        return  # first time request

    # Return cached response only if not expired
    cachedresponse.flags.append('cached')
    if self.policy.is_cached_response_fresh(cachedresponse, request):
        self.stats.inc_value('httpcache/hit', spider=spider)
        return cachedresponse

    # Keep a reference to cached response to avoid a second cache lookup on
    # process_response hook
    request.meta['cached_response'] = cachedresponse
Example 17
def process_request(self, request, spider):
    if request.url not in spider.start_urls and (
            redis_conn.hexists(redis_url_key, request.url) or
            redis_conn.hexists(redis_invalid_url_key, request.url)):
        logger.info("Skip URL: %s, has been crawled" % request.url)
        raise IgnoreRequest("URL %s has been crawled" % request.url)
Example 18
def process_request(self, request, spider):
    if not request.meta.get('crawl_once', self.default):
        return
    if self._get_key(request) in self.db:
        self.stats.inc_value('crawl_once/ignored')
        raise IgnoreRequest()
Example 19
def process_spider_exception(self, response, exception, spider):
    if (self.on_error_enabled and
            not isinstance(exception, IgnoreRequest) and
            self.counters['error'] < self.limits['error']):
        self.counters['error'] += 1
        self.save_response(response, spider)
Example 20
def test_process_spider_exception(self):
    assert self.instance.counters == {'all': 0, 'error': 0}
    self.instance.save_response = mock.Mock()
    # all conditions are true
    self.instance.on_error_enabled = True
    self.instance.process_spider_exception(
        'err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 1}
    # on_error flag is disabled, skipping
    self.instance.on_error_enabled = False
    self.instance.process_spider_exception(
        'err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 1}
    # exceeded error limit
    self.instance.on_error_enabled = True
    self.instance.counters['error'] = 11
    self.instance.process_spider_exception(
        'err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 11}
    # skip IgnoreRequest
    self.instance.limits['error'] = 12
    self.instance.process_spider_exception(
        'err-response', IgnoreRequest(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 11}
    # all conditions are true again
    self.instance.limits['all'] = 12
    self.instance.process_spider_exception(
        'err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 12}
Example 21
def process_request(self, request, spider):
    if not Domain.is_onion_url(request.url):
        return None
    parsed_url = urlparse.urlparse(request.url)
    host = parsed_url.hostname
    subdomains = host.count(".")
    if subdomains > 2:
        raise IgnoreRequest('Too many subdomains (%d > 2)' % subdomains)
    return None
Example 22
def process_request(self, request, spider):
    parsed_url = urlparse.urlparse(request.url)
    host = parsed_url.hostname
    if self.counter[host] < self.max_pages:
        self.counter[host] += 1
        spider.logger.info('Page count is %d for %s' % (self.counter[host], host))
        return None
    else:
        raise IgnoreRequest('MAX_PAGES_PER_DOMAIN reached, filtered %s' % request.url)
Example 23
def test_middleware():
    Rq = lambda path: Request(
        'http://example.com{}'.format(path),
        meta={'avoid_dup_content': True})
    Rs = lambda req, body: HtmlResponse(
        req.url, body=body.encode(), request=req)
    mw = AvoidDupContentMiddleware(
        initial_queue_limit=1, threshold=0.5, exploration=0.00)
    spider = Spider()
    req = Rq('/')
    mw.process_request(req, spider)
    mw.process_response(req, Rs(req, ''), spider)
    assert mw.dupe_predictor
    n_dropped = 0
    for i in range(10):
        req = Rq('/viewtopic.php?topic_id={}'.format(i))
        mw.process_request(req, spider)
        mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
        req = Rq('/viewtopic.php?topic_id={}&start=0'.format(i))
        try:
            mw.process_request(req, spider)
        except IgnoreRequest:
            n_dropped += 1
        else:
            mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
    mw.dupe_predictor.log_dupstats(min_dup=0)
    assert n_dropped == 5
    # one request in different order
    req = Rq('/viewtopic.php?topic_id=100&start=0')
    mw.process_request(req, spider)
    mw.process_response(req, Rs(req, ''), spider)
    mw.process_request(Rq('/viewtopic.php?topic_id=200'), spider)
    with pytest.raises(IgnoreRequest):
        mw.process_request(Rq('/viewtopic.php?topic_id=100'), spider)
    # test exploration
    mw.exploration = 0.5
    n_dropped = 0
    n_requests = 0
    for i in range(150, 170):
        req = Rq('/viewtopic.php?topic_id={}'.format(i))
        mw.process_request(req, spider)
        mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
        req = Rq('/viewtopic.php?topic_id={}&start=0'.format(i))
        n_requests += 1
        try:
            mw.process_request(req, spider)
        except IgnoreRequest:
            n_dropped += 1
        else:
            mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
    assert n_dropped > 0
    assert n_dropped < n_requests
Example 24
def test_crawl(tmpdir):
    settings = {'CRAWL_ONCE_PATH': str(tmpdir)}
    crawler = get_crawler(settings_dict=settings)
    req1 = scrapy.Request('http://example.com/1', meta={'crawl_once': True})
    req2 = scrapy.Request('http://example.com/2')
    req3 = scrapy.Request('http://example.com/3', meta={'crawl_once': True})
    resp1 = Response(req1.url, request=req1)
    resp2 = Response(req2.url, request=req2)

    with opened_middleware(crawler) as mw:
        # 1. check spider middleware interface
        assert len(mw.db) == 0
        assert crawler.stats.get_value('crawl_once/initial') == 0
        output = [{}, scrapy.Request('http://example.com')]

        # crawl_once is False
        res = list(mw.process_spider_output(resp2, output, crawler.spider))
        assert res == output
        assert len(mw.db) == 0

        # crawl_once is True
        res = list(mw.process_spider_output(resp1, output, crawler.spider))
        assert res == output
        assert len(mw.db) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 0
        assert crawler.stats.get_value('crawl_once/stored') == 1

        # 2. check downloader middleware interface
        assert mw.process_request(req2, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 0
        with pytest.raises(IgnoreRequest):
            mw.process_request(req1, crawler.spider)
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert mw.process_request(req3, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 0

    crawler = get_crawler(settings_dict=settings)
    with opened_middleware(crawler) as mw2:
        # it reuses the same file, so there are records
        assert len(mw2.db) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 1
        assert mw2.process_request(req2, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 0
        with pytest.raises(IgnoreRequest):
            mw2.process_request(req1, crawler.spider)
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert mw2.process_request(req3, crawler.spider) is None