Python源码示例:scrapy.exceptions.CloseSpider()
示例1
def handle_spider_error(self, _failure, request, response, spider):
    """Handle a failure raised while a spider callback processed a response.

    A ``CloseSpider`` exception is treated as a controlled-shutdown request;
    any other exception is logged, broadcast via the ``spider_error`` signal
    and counted in the crawler stats.
    """
    error = _failure.value
    # CloseSpider is control flow, not a real failure: close and stop here.
    if isinstance(error, CloseSpider):
        self.crawler.engine.close_spider(spider, error.reason or 'cancelled')
        return
    logger.error(
        "Spider error processing %(request)s (referer: %(referer)s)",
        {'request': request, 'referer': referer_str(request)},
        exc_info=failure_to_exc_info(_failure),
        extra={'spider': spider},
    )
    # Let extensions and other listeners react to the error.
    self.signals.send_catch_log(
        signal=signals.spider_error,
        failure=_failure,
        response=response,
        spider=spider,
    )
    # One stats bucket per exception class, e.g. spider_exceptions/KeyError.
    self.crawler.stats.inc_value(
        "spider_exceptions/%s" % error.__class__.__name__,
        spider=spider,
    )
示例2
def handle_spider_error(self, _failure, request, response, spider):
    """Handle a failure raised from a spider callback.

    ``CloseSpider`` triggers a controlled engine shutdown with the given
    reason; every other exception is logged, sent through the
    ``spider_error`` signal and counted in the crawler stats.
    """
    exc = _failure.value
    if isinstance(exc, CloseSpider):
        # Controlled shutdown: close with the requested reason (or a default).
        self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
        return
    logger.error(
        "Spider error processing %(request)s (referer: %(referer)s)",
        {'request': request, 'referer': referer_str(request)},
        exc_info=failure_to_exc_info(_failure),
        extra={'spider': spider}
    )
    # Notify interested handlers (extensions, middlewares) about the error.
    self.signals.send_catch_log(
        signal=signals.spider_error,
        failure=_failure, response=response,
        spider=spider
    )
    # Count errors per exception class, e.g. spider_exceptions/KeyError.
    self.crawler.stats.inc_value(
        "spider_exceptions/%s" % _failure.value.__class__.__name__,
        spider=spider
    )
示例3
def parse_item(self, response):
    """Collect subdomains of ``self.domain`` from the page's links.

    Yields one item per previously unseen subdomain and a follow-up
    ``Request`` for every matching link; raises ``CloseSpider`` once
    ``self.subdomain_limit`` subdomains have been collected.
    """
    item = InventusSpiderItem()
    for url in Selector(text=response.body).xpath('//a/@href').extract():
        # BUG FIX: the original test `not url.startswith('http://') or
        # url.startswith('https://')` mis-parenthesised the negation, so
        # every https:// URL was wrongly re-prefixed with the base URL.
        if not url.startswith(('http://', 'https://')):
            url = self.base_url + url
        try:
            parsed_uri = urlparse(url)
        except ValueError:
            # If the URL is invalid we can ignore it.
            continue
        if parsed_uri.netloc.endswith('.' + self.domain) and 'mailto:' not in url:
            if parsed_uri.netloc not in self.subdomains:
                self.subdomains.append(parsed_uri.netloc)
                item['subdomain'] = parsed_uri.netloc
                yield item
                if len(self.subdomains) > int(self.subdomain_limit):
                    break
            yield Request(url, callback=self.parse)
    if len(self.subdomains) >= int(self.subdomain_limit):
        raise CloseSpider('subdomain limit reached')
示例4
def process_request(self, request, spider):
    """Attach a random rotating proxy to the outgoing request.

    Requests that already carry a non-rotating proxy are left alone.
    When the pool is exhausted, either close the spider (if configured)
    or reset the pool and retry once before giving up.
    """
    if 'proxy' in request.meta and not request.meta.get('_rotating_proxy'):
        # The caller chose an explicit proxy; do not override it.
        return
    proxy = self.proxies.get_random()
    if not proxy:
        if self.stop_if_no_proxies:
            raise CloseSpider("no_proxies")
        # FIX: logger.warn() is a deprecated alias; use logger.warning().
        logger.warning("No proxies available; marking all proxies "
                       "as unchecked")
        self.proxies.reset()
        proxy = self.proxies.get_random()
        if proxy is None:
            logger.error("No proxies available even after a reset.")
            raise CloseSpider("no_proxies_after_reset")
    request.meta['proxy'] = proxy
    # Per-proxy download slot so throttling is applied per proxy.
    request.meta['download_slot'] = self.get_proxy_slot(proxy)
    request.meta['_rotating_proxy'] = True
示例5
def parse_article(self,response):
    """Parse a news article page and yield the filled item.

    Fills: author, abstract, content, news_no, crawl_date.  The partial
    item arrives via ``response.meta``.
    """
    item = response.meta.get("item",NewsItem())
    # Dead date cut-off code kept for reference: it closed the spider once
    # articles older than `self.end_day` days were reached.
    # news_date = item.get("news_date",None)
    # if news_date:
    #     struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M:%S")
    #     delta = self.end_now-struct_date
    #     print delta.days
    #     if delta.days == self.end_day:
    #         raise CloseSpider('today scrapy end')
    soup =BeautifulSoup(response.body)
    author = soup.find("span",class_="name").text if soup.find("span",class_="name") else None
    abstract = soup.find("p",class_="excerpt").text if soup.find("p",class_="excerpt") else None
    content = soup.find("div",class_="detail").text if soup.find("div",class_="detail") else None
    # Last URL segment minus its 5-char suffix (presumably ".html" style)
    # is the news number — TODO confirm against real URLs.
    news_no = response.url.split("/")[-1][:-5]
    item["author"] = author
    item["abstract"] = abstract
    item["content"] = content
    item["crawl_date"] = NOW
    item["news_no"] = news_no
    yield item
示例6
def parse_news(self,response):
    """Parse a news detail page: extract the body text and yield the item.

    The partially-filled item arrives via ``response.meta``.
    """
    item = response.meta.get("item",None)
    # Original note (translated): the stop condition was moved into content
    # crawling to avoid transaction errors; the dead code is kept below.
    # news_date = item.get("news_date",None)
    # if news_date:
    #     struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d")
    #     news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
    #     delta = self.end_now-struct_date
    #     if delta.days == self.end_day:
    #         raise CloseSpider('today scrapy end')
    soup = BeautifulSoup(response.body)
    news_content_group = soup.find("div",class_="entry-content group")
    # Strip the "related posts" block so it does not pollute the body text.
    news_content_group.find("div",class_="related_posts").replace_with("")
    content = news_content_group.text.strip()
    item["content"] = content
    item["catalogue"] = u"最新内容"
    yield item
示例7
def parse(self, response):
    """Scan a page for login/registration forms, then follow links.

    Closes the spider once both a login and a registration form have
    been found; link requests matching priority patterns are boosted.
    """
    self.logger.info(response.url)
    if response.text:
        for _, form_meta in formasaurus.extract_forms(response.text):
            kind = form_meta['form']
            if kind == 'login' and not self.found_login:
                self.found_login = True
                self.handle_login_form(response.url)
            elif kind == 'registration' and not self.found_registration:
                self.found_registration = True
                self.handle_registration_form(response.url)
    if self.found_registration and self.found_login:
        # Both targets located — no need to crawl any further.
        raise CloseSpider('done')
    for link in self.link_extractor.extract_links(response):
        text = ' '.join([relative_url(link.url), link.text]).lower()
        boosted = any(pattern in text for pattern in self.priority_patterns)
        yield self.request(link.url, self.parse,
                           priority=100 if boosted else 0)
示例8
def open_spider(self, spider):
    """Called when a spider starts: set up DB connection, stats and log.

    Raises ``CloseSpider`` when the database is unreachable or the
    crawl-log row cannot be created.
    """
    # Create a dedicated database connection for the spider.
    spider.postgres = postgresSQL()
    # FIX: idiomatic truthiness test instead of `== False` (which also
    # missed a None return from connect()).
    if not spider.postgres.connect():
        raise CloseSpider(" Database Connection cannot be established!")
    # Initialize the per-spider URL statistics.
    spider.urls_dropped = 0
    spider.urls_scraped = 0
    spider.urls_parsed = 0
    spider.urls_stored = 0
    # Add/verify the site row in the database.
    self.checkSite(spider)
    # Start the spider's crawl log; without it the run cannot be recorded.
    spider.log_id = spider.postgres.start_log(spider.custom_settings['site_id'], os.getpid())
    if not spider.log_id:
        raise CloseSpider(" Unable to Start Log!")
示例9
def checkSite(self, spider):
    """ Verifies if site exist in database, add otherwise """
    # Verify Database Connection
    if not spider.postgres.checkConnection():
        logger.error(__name__ + " No Database Connection Found!")
        raise CloseSpider(" No Database Connection Found!")
    try:
        # Check if site Exists in Database using it's site_id
        if not spider.postgres.siteExists(spider.custom_settings['site_id']):
            # Add it to Database if not
            spider.postgres.cursor.execute(spider.postgres.insert_site_str, (
                spider.custom_settings['site_id'],
                spider.custom_settings['site_name'],
                spider.custom_settings['site_url'],
                spider.name,
                )
            )
    except Exception as e:
        # NOTE(review): the broad Exception catch hides the real DB error
        # class; consider narrowing to the driver's error type.
        logger.error(__name__ + " Unable to add site to Database! Msg: " + str(e))
        raise CloseSpider("Unable to add site to Database")
# Special Methods Below, Read about them before altering
示例10
def spider_closed(self, spider, reason):
    """Called after the spider closes: persist final crawl statistics
    and release the database connection."""
    # Without a live connection the log row cannot be finalised.
    if not spider.postgres.checkConnection():
        raise CloseSpider("Unable to Establish a Database Connection")
    # Gather the per-spider counters accumulated during the crawl.
    stats = {
        "urls_dropped": spider.urls_dropped,
        "urls_scraped": spider.urls_scraped,
        "urls_parsed": spider.urls_parsed,
        "urls_stored": spider.urls_stored,
    }
    # Close out the crawl-log row with the collected stats and the reason.
    if not spider.postgres.end_log(spider.log_id, stats, reason):
        logger.error(__name__ + " Unable to End Log for Spider " + spider.name + " with stats: " + str(stats))
    # Release the dedicated database connection.
    spider.postgres.connection.close()
    logger.info(__name__ + " [" + spider.name + "] SPIDER CLOSED")
示例11
def parse(self, response):
    """Parse a topic page: yield one ``Girdi`` item per entry, then
    request the next page while any pages remain."""
    self.log("PARSING: %s" % response.request.url, level=log.INFO)
    items_to_scrape = response.xpath('//*[@id="topic"]/ul[@id="entry-list"]/li')
    if len(items_to_scrape) == 0:
        # An empty entry list usually means the site's HTML changed.
        self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                 level=log.ERROR)
        raise CloseSpider('no_item_found')
    for sel in items_to_scrape:
        girdi_id = sel.xpath('./@data-id').extract()[0]
        baslik_id = response.xpath('//*[@id="title"]/a/@href').re(r'--(\d*)')[0]
        baslik = response.xpath('//*[@id="title"]/a/span/text()').extract()[0]
        date = sel.xpath('./footer/div[@class="info"]/a[@class="entry-date permalink"]/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
        text = sel.xpath('string(./div)').extract()[0]
        nick = sel.xpath('./footer/div[@class="info"]/a[@class="entry-author"]/text()').extract()[0]
        item = Girdi()
        item['source'] = self.name
        item['baslik'] = baslik
        item['girdi_id'] = girdi_id
        item['baslik_id'] = baslik_id
        item['datetime'] = datetime.strptime(date, '%d.%m.%Y %H:%M')
        item['text'] = text
        item['nick'] = nick
        yield item
    # (Translated from the original Turkish note:) the site paginates via
    # JavaScript, so the next-page link cannot be read with XPath; the
    # current page number and total page count are embedded in the HTML,
    # and the next page URL is built from them.
    current_page = int(response.xpath('//*[@id="topic"]/div[2]/@data-currentpage').extract()[0])
    page_count = int(response.xpath('//*[@id="topic"]/div[2]/@data-pagecount').extract()[0])
    current_url = response.request.url.split('?p')[0]
    next_page = current_page + 1
    if page_count >= next_page:
        # if current_page < 1:
        yield Request('%s?p=%s' % (current_url, next_page))
示例12
def __init__(self, **kwargs):
    """Initialise the spider; requires a comma-separated 'baslik' kwarg
    naming the topics to scrape."""
    super(GenericSozlukSpider, self).__init__(**kwargs)
    if 'baslik' not in kwargs:
        # Without a topic list there is nothing to crawl.
        raise CloseSpider('Baslik should be given to scrape')
    # One URL per comma-separated topic.
    self.urls = kwargs['baslik'].split(',')
    self.allowed_domains = []
示例13
def parse(self, response):
    """Parse an entry-list page: yield one ``Girdi`` item per entry and
    request the next page while pages remain."""
    self.log("PARSING: %s" % response.request.url, level=log.INFO)
    items_to_scrape = response.xpath('//*[@id="entry-list"]/li/article')
    if len(items_to_scrape) == 0:
        # An empty list usually signals a site HTML change.
        self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                 level=log.ERROR)
        raise CloseSpider('no_item_found')
    for sel in items_to_scrape:
        girdi_id = sel.xpath('./footer/div[@class="entrymenu"]/@data-info').extract()[0].split(',')[0]
        baslik_id = response.xpath('//*[@id="canonical_url"]/@value').re(r'--(\d*)')[0]
        baslik = response.xpath('//*[@id="title"]/a/text()').extract()[0]
        date = sel.xpath('./footer/div[2]/time/a/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
        text = sel.xpath('string(./div)').extract()[0]
        nick = sel.css('a.yazarlink').xpath('text()').extract()[0]
        item = Girdi()
        item['source'] = self.name
        item['baslik'] = baslik
        item['girdi_id'] = girdi_id
        item['baslik_id'] = baslik_id
        item['datetime'] = datetime.strptime(date, '%d.%m.%Y %H:%M')
        item['text'] = text
        item['nick'] = nick
        yield item
    # The page number appears in the <title> as "sayfa N"; absence means
    # we are on page 1.
    current_url = response.request.url.split('/sayfa')[0]
    title_re = response.xpath('//title').re(r'sayfa (\d*)')
    current_page = int(title_re[0]) if title_re else 1
    page_count = int(response.xpath('//a[@rel="last"]')[0].xpath('text()').extract()[0])
    next_page = current_page + 1
    if page_count >= next_page:
        # if current_page < 2:
        yield Request('%s/sayfa/%s' % (current_url, next_page))
示例14
def parse(self, response):
    """Parse an entry-list page: yield a ``Girdi`` item per entry, then
    build the next page URL (the site paginates as /k/BASLIK/<page>)."""
    self.log("PARSING: %s" % response.request.url, level=log.INFO)
    items_to_scrape = response.xpath('//*[@id="entry-list"]/li/article')
    if len(items_to_scrape) == 0:
        # An empty list usually signals a site HTML change.
        self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                 level=log.ERROR)
        raise CloseSpider('no_item_found')
    for sel in items_to_scrape:
        girdi_id = sel.css('span.voting').css('a.entryid_a').xpath('./span/text()').re(r'#(\d*)')[0]
        baslik_id = response.xpath('//*[@id="main"]/div/div[1]/div[1]/div/ul/li[1]/ul/li/a/@onclick').re("'(\d*)'")[0]
        baslik = response.css('h1.title').xpath('./a/text()').extract()[0]
        date = sel.xpath('.//a[@class="entry_tarih"]/small/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
        text = sel.css('div.entry-p').xpath('string(.)').extract()[0]
        nick = sel.css('span.entry-author').xpath('./a/text()').extract()[0].lower()
        item = Girdi()
        item['source'] = self.name
        item['baslik'] = baslik
        item['girdi_id'] = girdi_id
        item['baslik_id'] = baslik_id
        item['datetime'] = datetime.strptime(date, '%d.%m.%Y %H:%M')
        item['text'] = text
        item['nick'] = nick
        yield item
    current_page = int(response.css('div.pagination').css('li.active').xpath('./a/text()').extract()[0])
    page_count = int(response.xpath('//*[@id="main"]/div/div[3]/ul/li/a')[-2].xpath('text()').extract()[0])
    next_page = current_page + 1
    # (Translated from the original Turkish note:) to return the pagination
    # link we need the topic URL without the page suffix. Addresses look
    # like uludagsozluk.com/k/BASLIK/10 — keep the path up to the topic part.
    url_split = urlsplit(response.request.url)
    current_baslik_url = '%s://%s%s' % (url_split.scheme, url_split.netloc, '/'.join(url_split.path.split('/')[:3]))
    if page_count >= next_page:
        # if current_page < 1:
        yield Request('%s/%s' % (current_baslik_url, next_page))
示例15
def capture_exceptions(callback):
    """ Wrapper for Scrapy callbacks that captures exceptions within
    the provided callback and yields it under `exception` property. Also
    spider is closed on the first exception. """
    def parse(*args, **kwargs):
        try:
            for result in callback(*args, **kwargs):
                yield result
        except Exception as e:
            # Surface the exception as an item, then stop the spider.
            yield {'exception': e}
            raise CloseSpider("Exception in callback detected")
    # Mimic the wrapped callback's type annotations.
    parse.__annotations__ = callback.__annotations__
    return parse
示例16
def __init__(self, IMAGE_STORE, MAXIMUM_IMAGE_NUMBER):
    """Store pipeline settings and reset the download counters.

    Raises ``CloseSpider`` when either setting is missing.
    """
    if IMAGE_STORE is None or MAXIMUM_IMAGE_NUMBER is None:
        raise CloseSpider('Pipeline load settings failed')
    self.IMAGE_STORE = IMAGE_STORE
    self.MAXIMUM_IMAGE_NUMBER = MAXIMUM_IMAGE_NUMBER
    # Count of images downloaded so far.
    self.image_max_counter = 0
    # Directory index; bumped once per thousand images.
    self.dir_counter = 0
示例17
def process_item(self, item, spider):
    """Save the item's image to disk and enforce the download cap.

    Drops null items; closes the spider once MAXIMUM_IMAGE_NUMBER
    images have been downloaded.
    """
    if item is None:
        raise DropItem('Item is null')
    # Build "<name>-<id>-by@<author>.jpg" inside the current batch dir.
    target_dir = self.make_dir()
    image_final_name = item['image_name'] + '-' + item['image_id'] + '-by@' + item['author'] + '.jpg'
    self.download_image(item['image_src'], os.path.join(target_dir, image_final_name))
    self.image_max_counter += 1
    if self.image_max_counter >= self.MAXIMUM_IMAGE_NUMBER:
        # Cap reached: stop the whole crawl.
        raise CloseSpider('Current downloaded image already equal maximum number')
    return item
示例18
def process_spider_exception(self, response, exception, spider):
    """Close the spider when the site answers an HttpError with status 456.

    Status 456 is presumably the site's anti-scraping marker — verify
    against the target site's behaviour.
    """
    if isinstance(exception, HttpError) and response.status == 456:
        # response.meta['fool_blocked'] = True
        # return None
        raise CloseSpider('catch forbidden,close for a while')
# downloader middleware
示例19
def open_spider(self, spider):
    """Create a DB session and make sure the configured site row exists.

    Reads the ``SITE`` mapping from the spider settings, inserts a
    ``LiveTVSite`` row on first sight, and records per-site crawl state
    in ``self.site``.  Raises ``CloseSpider`` when no configuration is
    present.
    """
    site_setting = spider.settings.get('SITE')
    if not site_setting:
        error_msg = 'Can not find the website configuration from settings.'
        spider.logger.error(error_msg)
        raise CloseSpider(error_msg)
    self.session = self.session_maker()
    site = self.session.query(LiveTVSite).filter(LiveTVSite.code == site_setting['code']).one_or_none()
    if not site:
        # First run for this site: persist its metadata.
        site = LiveTVSite(code=site_setting['code'], name=site_setting['name'],
                          description=site_setting['description'], url=site_setting['url'],
                          image=site_setting['image'], show_seq=site_setting['show_seq'])
        self.session.add(site)
        self.session.commit()
    # Track the crawl start time and discovered channels per site code.
    self.site[site.code] = {'id': site.id, 'starttime': datetime.utcnow(), 'channels': {}}
示例20
def parse(self, response):
    """Parse the front page: yield a detail request per news entry and
    a request for the next listing page.

    Next-page URL format:
    load_chosen.jsp?nodeids=...&topCids=...&pageidx=<page>&lastTime=<ts>
    """
    html = response.body
    soup = BeautifulSoup(html, "lxml")
    # Crawl the front-page news list.
    for i in self.fetch_newslist(soup):
        request = scrapy.Request(i['news_url'], callback=self.parse_news)
        request.meta['item'] = i
        request.meta['pageindex'] = 1
        yield request
    # Find the timestamp of the last listed entry, used for paging.
    lasttime = "nothing"
    for i in soup.select('div[class="news_li"]'):
        # BUG FIX: dict.has_key() was removed in Python 3; use `in`.
        if "lasttime" in i.attrs:
            lasttime = i["lasttime"]
            break
    # Extract the paging parameters embedded in the page's JavaScript.
    load_chosen = re.search(r'data.:."(.*)".+.masonry', html)
    page = 2
    if load_chosen:
        tp_url = "http://www.thepaper.cn/load_chosen.jsp?%s%s&lastTime=%s" % (load_chosen.group(1), page, lasttime)
        yield scrapy.Request(tp_url, callback=self.next_page_parse)
示例21
def parse_news(self,response):
    """Parse a news page: fill content, news_date, news_no, crawl_date and
    referer_web, then run the crawl-window check via ``judge_news_crawl``.

    Records ``self.flag`` when the item falls outside the crawl window.
    """
    item = response.meta.get("item",NewsItem())
    pageindex = response.meta.get("pageindex",1)
    soup = BeautifulSoup(response.body)
    # The concrete publication time must be scraped from the page itself.
    # Known failure case: http://info.meadin.com/PictureNews/2938_1.shtml
    news_date = soup.find("span",class_="arial").text if soup.find("span",class_="arial") else None
    if news_date:
        # Dead date cut-off code kept for reference:
        # struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M:%S")
        # delta = self.end_now-struct_date
        # if delta.days == self.end_day:
        #     raise CloseSpider('today scrapy end')
        referer_web = list(soup.find("p",class_="source").strings)[-1] if soup.find("p",class_="source") else None
        # Extract the article body.
        art,content = None,None
        art = soup.find("div",class_="article js-article")
        if art:
            # Remove the abstract so it is not duplicated in the body text.
            art.find("div",class_="intro").replace_with("")
            content =art.text.strip()
        news_no =response.url.split("/")[-1].split("_")[0]
        item["news_date"]=news_date
        item["content"]=content
        item["referer_web"]=referer_web
        item["crawl_date"]=NOW
        item["news_no"]=news_no
        item = judge_news_crawl(item)
        if item:
            yield item
        else:
            # Outside the crawl window: remember which page index stopped us.
            self.flag = pageindex
    else:
        logger.warning("can't find news_date.the url is %s" % response.url)
示例22
def __check_for_close(self):
    """
    Check to see if this spider has been running for longer than the maximum amount
    of allowed time, and stop the spider if it has.
    :return: None
    """
    if self._start_time is None:
        self._start_time = DatetimeHelper.now()
    # BUG FIX: elapsed time was computed from `self.start_time`, but the
    # timestamp is stored in `self._start_time`. Unless a `start_time`
    # property exists elsewhere (not visible here), that raised
    # AttributeError — read the attribute that was actually written.
    elapsed_time = (DatetimeHelper.now() - self._start_time).total_seconds()
    if elapsed_time > self.max_run_time:
        raise CloseSpider(
            "Spider run time exceeded maximum time of %s seconds. Closing."
            % (self.max_run_time,)
        )
示例23
def build_check_recipient(self, ip, port, scheme,
                          user=None, password=None):
    """
    1. build a request for availability checking
    2. drop it if already existed
    :return: Request
    """
    if self.complete_condition():
        # Enough proxies collected — stop the whole spider.
        raise exceptions.CloseSpider('Enough items')
    spec = dict(ip=ip, port=port, scheme=scheme)
    if not valid_format(spec):
        self.logger.debug('Got wrong format (%s, %s). Clear it.' % (ip, port))
        return {}
    if self.already_exists(spec):
        self.logger.debug('Got duplicated %s. Clear it.' % spec.values())
        return {}  # drop it
    proxy_url = utils.build_proxy_url(ip, port, scheme, user, password)
    # Stored as 0/1 rather than a bool.
    need_auth = int(bool(user and password))
    item = Proxy(
        ip=ip,
        scheme=scheme,
        port=port,
        need_auth=need_auth,
        url=proxy_url,
    )
    if need_auth:
        item['user'], item['password'] = user, password
    self.logger.debug('Got unchecked %s' % item)
    return self.build_check_request(item)
示例24
def process_item(self, item, spider):
    """Drop items whose URL is already stored; abort without a DB connection."""
    if not spider.postgres.checkConnection():
        raise CloseSpider("Unable to Establish a Database Connection")
    link = item['link']
    # Duplicate URLs are dropped rather than re-stored.
    if spider.postgres.checkUrlExists(link):
        raise DropItem("Url " + link + " Exists in Database")
    return item
示例25
def parse_news(self,response):
# print response.url,"response"
PageKey = response.meta.get("topic_id")
PageNumber =response.meta.get("PageNumber")
flag_id =str(int(PageKey)-40037910)
soup =BeautifulSoup(response.body,"lxml")
#2016-07-13
news_date = soup.find("time").text if soup.find("time") else None
# print self.flag[flag_id],int(PageNumber)
"""
条件是该类别标记(self.flag[flag_id])是0爬取,说明还没有爬到过期的。
爬取页面是该页的也继续爬取。因为一个页面的爬取顺序是异步的。
self.flag[flag_id]=过期页数
"""
if not self.flag[flag_id] or int(PageNumber)==self.flag[flag_id]:
#,没有超出范围
struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d")
# print self.end_now,struct_date,"time"
delta = self.end_now-struct_date
# print delta.days,"delta day ~~~~~~~~~~~~~~~~"
if delta.days > self.end_day:
self.flag[str(flag_id)]=int(PageNumber)
# print flag_id,"stop ~~~~~~"
# raise CloseSpider('today scrapy end')
else:
head = soup.find("div",class_="post-head")
topic,title,abstract=None,None,None
if head:
topic = head.find("span",class_="category").text if head.find("span",class_="category") else None
title =head.find("h1",class_="h1").text if head.find("h1",class_="h1") else None
abstract = head.find("span",class_="kicker").text if head.find("span",class_="kicker") else None
content = soup.find("div",class_="post-body clearfix").text if soup.find("div",class_="post-body clearfix") else None
news_no = response.url.split("/")[-1].split("?")[0]
#TODO 评论数量js渲染,未解决
item = NewsItem(title=title,topic=topic,
abstract=abstract,news_date=news_date,
content=content,news_no=news_no
,crawl_date=NOW,news_url=response.url,catalogue='新闻板块')
yield item