Python source code examples: scrapy.exceptions.CloseSpider()

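raise CloseSpider(reason) can be used from a spider callback (and, as the examples below show, from several pipeline, middleware and extension hooks) to ask the Scrapy engine to stop the spider; the optional reason string is recorded as the close reason in the crawl stats. A minimal, hedged sketch of the typical pattern (the spider name, start URL and item limit are illustrative assumptions, not taken from the examples below):

import scrapy
from scrapy.exceptions import CloseSpider


class LimitedSpider(scrapy.Spider):
    # Hypothetical spider used only to illustrate raising CloseSpider.
    name = "limited_example"
    start_urls = ["https://example.com"]  # assumed placeholder URL

    max_items = 10   # assumed illustrative limit
    item_count = 0

    def parse(self, response):
        for href in response.css("a::attr(href)").getall():
            if self.item_count >= self.max_items:
                # The engine catches this and shuts the spider down;
                # "item_limit_reached" becomes the close reason.
                raise CloseSpider("item_limit_reached")
            self.item_count += 1
            yield {"url": response.urljoin(href)}

Example 1 below shows the engine side of this: handle_spider_error detects a CloseSpider raised in a callback and closes the spider with the given reason.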
Example 1
def handle_spider_error(self, _failure, request, response, spider):
        exc = _failure.value
        if isinstance(exc, CloseSpider):
            self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
            return
        logger.error(
            "Spider error processing %(request)s (referer: %(referer)s)",
            {'request': request, 'referer': referer_str(request)},
            exc_info=failure_to_exc_info(_failure),
            extra={'spider': spider}
        )
        self.signals.send_catch_log(
            signal=signals.spider_error,
            failure=_failure, response=response,
            spider=spider
        )
        self.crawler.stats.inc_value(
            "spider_exceptions/%s" % _failure.value.__class__.__name__,
            spider=spider
        ) 
Example 2
def parse_item(self, response):
        item = InventusSpiderItem()
        for url in Selector(text=response.text).xpath('//a/@href').extract():
            if not url.startswith(('http://', 'https://')):
                url = self.base_url + url
            try:
                parsed_uri = urlparse(url)
            except ValueError:
                # If the URL is invalid we can ignore it.
                continue
            if parsed_uri.netloc.endswith('.' + self.domain) and 'mailto:' not in url:
                if parsed_uri.netloc not in self.subdomains:
                    self.subdomains.append(parsed_uri.netloc)
                    item['subdomain'] = parsed_uri.netloc
                    yield item

                    if len(self.subdomains) > int(self.subdomain_limit):
                        break

                yield Request(url, callback=self.parse)

        if len(self.subdomains) >= int(self.subdomain_limit):
            raise CloseSpider('subdomain limit reached') 
Example 3
def process_request(self, request, spider):
        if 'proxy' in request.meta and not request.meta.get('_rotating_proxy'):
            return
        proxy = self.proxies.get_random()
        if not proxy:
            if self.stop_if_no_proxies:
                raise CloseSpider("no_proxies")
            else:
                logger.warning("No proxies available; marking all proxies "
                               "as unchecked")
                self.proxies.reset()
                proxy = self.proxies.get_random()
                if proxy is None:
                    logger.error("No proxies available even after a reset.")
                    raise CloseSpider("no_proxies_after_reset")

        request.meta['proxy'] = proxy
        request.meta['download_slot'] = self.get_proxy_slot(proxy)
        request.meta['_rotating_proxy'] = True 
Example 4
def parse_article(self,response):
        #content,news_no,crawl_date
        item = response.meta.get("item",NewsItem())
        # news_date = item.get("news_date",None)
        # if news_date:
        #     struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M:%S")
        #     delta = self.end_now-struct_date
        #     print delta.days
        #     if delta.days == self.end_day:
        #         raise CloseSpider('today scrapy end')
        soup = BeautifulSoup(response.body)
        author = soup.find("span",class_="name").text if soup.find("span",class_="name") else None
        abstract = soup.find("p",class_="excerpt").text if soup.find("p",class_="excerpt") else None
        content = soup.find("div",class_="detail").text if soup.find("div",class_="detail") else None
        news_no = response.url.split("/")[-1][:-5]
        item["author"] = author
        item["abstract"] = abstract
        item["content"] = content
        item["crawl_date"] = NOW
        item["news_no"] = news_no
        yield item 
Example 5
def parse_news(self,response):
        item = response.meta.get("item",None)
        # # Moved the stop condition into the content parsing to avoid transaction errors
        # news_date = item.get("news_date",None)
        # if news_date:
        #     struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d")
        #     news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
        #
        #     delta = self.end_now-struct_date
        #     if delta.days == self.end_day:
        #         # pass
        #         raise CloseSpider('today scrapy end')
        soup = BeautifulSoup(response.body)
        news_content_group = soup.find("div",class_="entry-content group")
        # Strip the related-reading block
        news_content_group.find("div",class_="related_posts").replace_with("")
        content = news_content_group.text.strip()
        item["content"] = content
        item["catalogue"] = u"最新内容"
        yield item 
Example 6
def parse(self, response):
        self.logger.info(response.url)
        if response.text:
            for _, meta in formasaurus.extract_forms(response.text):
                form_type = meta['form']
                if form_type == 'login' and not self.found_login:
                    self.found_login = True
                    self.handle_login_form(response.url)
                elif form_type == 'registration' \
                        and not self.found_registration:
                    self.found_registration = True
                    self.handle_registration_form(response.url)
        if self.found_registration and self.found_login:
            raise CloseSpider('done')
        for link in self.link_extractor.extract_links(response):
            priority = 0
            text = ' '.join([relative_url(link.url), link.text]).lower()
            if any(pattern in text for pattern in self.priority_patterns):
                priority = 100
            yield self.request(link.url, self.parse, priority=priority) 
Example 7
def open_spider(self, spider):
        # Called when a spider starts

        #Create a dedicated Database Connection for the spider
        spider.postgres = postgresSQL()

        #Verify the Connection
        if not spider.postgres.connect():
            raise CloseSpider(" Database Connection cannot be established!")

        #Initialize the Stats
        spider.urls_dropped = 0
        spider.urls_scraped = 0
        spider.urls_parsed = 0
        spider.urls_stored = 0

        #Add/Verify Site in Database
        self.checkSite(spider)

        #Start Spider's Log
        spider.log_id = spider.postgres.start_log(spider.custom_settings['site_id'], os.getpid())
        if not spider.log_id:
            raise CloseSpider(" Unable to Start Log!") 
Example 8
def checkSite(self, spider):
        """ Verifies if site exist in database, add otherwise """
        # Verify Database Connection
        if not spider.postgres.checkConnection():
            logger.error(__name__ + " No Database Connection Found!")
            raise CloseSpider(" No Database Connection Found!")
        
        try:
            # Check if the site exists in the database using its site_id
            if not spider.postgres.siteExists(spider.custom_settings['site_id']):
                # Add it to Database if not
                spider.postgres.cursor.execute(spider.postgres.insert_site_str, (
                    spider.custom_settings['site_id'],
                    spider.custom_settings['site_name'],
                    spider.custom_settings['site_url'],
                    spider.name,
                    )
                )
        except Exception as e:
            logger.error(__name__ + " Unable to add site to Database! Msg: " + str(e))
            raise CloseSpider("Unable to add site to Database")

    # Special Methods Below, Read about them before altering 
Example 9
def spider_closed(self, spider, reason):
        # Called after the spider is closed

        # Check Connection
        if not spider.postgres.checkConnection():
            raise CloseSpider("Unable to Establish a Database Connection")

        # Collect all Stats
        url_stats = {
            "urls_dropped": spider.urls_dropped,
            "urls_scraped": spider.urls_scraped,
            "urls_parsed": spider.urls_parsed,
            "urls_stored": spider.urls_stored
        }

        # End The Log
        if not spider.postgres.end_log(spider.log_id, url_stats, reason):
            logger.error(__name__ + " Unable to End Log for Spider " + spider.name + " with stats: " + str(url_stats))
        
        # Close the database connection
        spider.postgres.connection.close()
        logger.info(__name__ + " [" + spider.name + "] SPIDER CLOSED") 
Example 10
def parse(self, response):
        self.log("PARSING: %s" % response.request.url, level=log.INFO)

        items_to_scrape = response.xpath('//*[@id="topic"]/ul[@id="entry-list"]/li')
        if len(items_to_scrape) == 0:
            self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                     level=log.ERROR)
            raise CloseSpider('no_item_found')

        for sel in items_to_scrape:
            girdi_id = sel.xpath('./@data-id').extract()[0]
            baslik_id = response.xpath('//*[@id="title"]/a/@href').re(r'--(\d*)')[0]
            baslik = response.xpath('//*[@id="title"]/a/span/text()').extract()[0]
            date = sel.xpath('./footer/div[@class="info"]/a[@class="entry-date permalink"]/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
            text = sel.xpath('string(./div)').extract()[0]
            nick = sel.xpath('./footer/div[@class="info"]/a[@class="entry-author"]/text()').extract()[0]

            item = Girdi()
            item['source'] = self.name
            item['baslik'] = baslik
            item['girdi_id'] = girdi_id
            item['baslik_id'] = baslik_id
            item['datetime'] = datetime.strptime(date, '%d.%m.%Y %H:%M')
            item['text'] = text
            item['nick'] = nick

            yield item

        # The sozluk does its pagination with JavaScript, so we cannot grab the page link via XPath; however, the
        # current page number and the page count are present in the HTML. Use this information to determine the
        # address of the next page to crawl. Hopefully SSG does not change it :(
        current_page = int(response.xpath('//*[@id="topic"]/div[2]/@data-currentpage').extract()[0])
        page_count = int(response.xpath('//*[@id="topic"]/div[2]/@data-pagecount').extract()[0])

        current_url = response.request.url.split('?p')[0]

        next_page = current_page + 1
        if page_count >= next_page:
        # if current_page < 1:
            yield Request('%s?p=%s' % (current_url, next_page)) 
Example 11
def __init__(self, **kwargs):
        super(GenericSozlukSpider, self).__init__(**kwargs)

        if 'baslik' not in kwargs:
            raise CloseSpider('Baslik should be given to scrape')

        self.urls = kwargs['baslik'].split(',')
        self.allowed_domains = [] 
Example 12
def parse(self, response):
        self.log("PARSING: %s" % response.request.url, level=log.INFO)

        items_to_scrape = response.xpath('//*[@id="entry-list"]/li/article')
        if len(items_to_scrape) == 0:
            self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                     level=log.ERROR)
            raise CloseSpider('no_item_found')

        for sel in items_to_scrape:
            girdi_id = sel.xpath('./footer/div[@class="entrymenu"]/@data-info').extract()[0].split(',')[0]
            baslik_id = response.xpath('//*[@id="canonical_url"]/@value').re(r'--(\d*)')[0]
            baslik = response.xpath('//*[@id="title"]/a/text()').extract()[0]
            date = sel.xpath('./footer/div[2]/time/a/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
            text = sel.xpath('string(./div)').extract()[0]
            nick = sel.css('a.yazarlink').xpath('text()').extract()[0]

            item = Girdi()
            item['source'] = self.name
            item['baslik'] = baslik
            item['girdi_id'] = girdi_id
            item['baslik_id'] = baslik_id
            item['datetime'] = datetime.strptime(date, '%d.%m.%Y %H:%M')
            item['text'] = text
            item['nick'] = nick

            yield item

        current_url = response.request.url.split('/sayfa')[0]

        title_re = response.xpath('//title').re(r'sayfa (\d*)')
        current_page = int(title_re[0]) if title_re else 1

        page_count = int(response.xpath('//a[@rel="last"]')[0].xpath('text()').extract()[0])

        next_page = current_page + 1
        if page_count >= next_page:
        # if current_page < 2:
            yield Request('%s/sayfa/%s' % (current_url, next_page)) 
Example 13
def parse(self, response):
        self.log("PARSING: %s" % response.request.url, level=log.INFO)

        items_to_scrape = response.xpath('//*[@id="entry-list"]/li/article')
        if len(items_to_scrape) == 0:
            self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                     level=log.ERROR)
            raise CloseSpider('no_item_found')

        for sel in items_to_scrape:
            girdi_id = sel.css('span.voting').css('a.entryid_a').xpath('./span/text()').re(r'#(\d*)')[0]
            baslik_id = response.xpath('//*[@id="main"]/div/div[1]/div[1]/div/ul/li[1]/ul/li/a/@onclick').re(r"'(\d*)'")[0]
            baslik = response.css('h1.title').xpath('./a/text()').extract()[0]
            date = sel.xpath('.//a[@class="entry_tarih"]/small/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
            text = sel.css('div.entry-p').xpath('string(.)').extract()[0]
            nick = sel.css('span.entry-author').xpath('./a/text()').extract()[0].lower()

            item = Girdi()
            item['source'] = self.name
            item['baslik'] = baslik
            item['girdi_id'] = girdi_id
            item['baslik_id'] = baslik_id
            item['datetime'] = datetime.strptime(date, '%d.%m.%Y %H:%M')
            item['text'] = text
            item['nick'] = nick

            yield item

        current_page = int(response.css('div.pagination').css('li.active').xpath('./a/text()').extract()[0])
        page_count = int(response.xpath('//*[@id="main"]/div/div[3]/ul/li/a')[-2].xpath('text()').extract()[0])
        next_page = current_page + 1

        # To yield the pagination link in the next step, we need to extract the topic URL that precedes the pagination part.
        # The address has the form uludagsozluk.com/k/BASLIK/10. Take the part of the path before the page number.
        url_split = urlsplit(response.request.url)
        current_baslik_url = '%s://%s%s' % (url_split.scheme, url_split.netloc, '/'.join(url_split.path.split('/')[:3]))

        if page_count >= next_page:
        # if current_page < 1:
            yield Request('%s/%s' % (current_baslik_url, next_page)) 
Example 14
def capture_exceptions(callback):
    """ Wrapper for Scrapy callbacks that captures exceptions within
    the provided callback and yields it under `exception` property. Also
    spider is closed on the first exception. """
    def parse(*args, **kwargs):
        try:
            yield from callback(*args, **kwargs)
        except Exception as e:
            yield {'exception': e}
            raise CloseSpider("Exception in callback detected")
    # Mimic type annotations
    parse.__annotations__ = callback.__annotations__
    return parse 
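A hedged usage sketch for the wrapper above (the spider class and URL are assumptions added for illustration). Applying capture_exceptions to a callback makes any exception in it surface as an {'exception': ...} item, after which the raised CloseSpider is handled by the engine as in Example 1:

import scrapy


class CheckedSpider(scrapy.Spider):
    # Hypothetical spider; not part of the original snippet.
    name = "checked_example"
    start_urls = ["https://example.com"]  # assumed placeholder URL

    @capture_exceptions  # wraps the generator callback defined below
    def parse(self, response):
        title = response.css("title::text").get()
        if title is None:
            raise ValueError("page has no <title>")  # surfaced as an item
        yield {"title": title}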
Example 15
def __init__(self, IMAGE_STORE, MAXIMUM_IMAGE_NUMBER):
        if IMAGE_STORE is None or MAXIMUM_IMAGE_NUMBER is None:
            raise CloseSpider('Pipeline load settings failed')
        self.IMAGE_STORE = IMAGE_STORE
        self.MAXIMUM_IMAGE_NUMBER = MAXIMUM_IMAGE_NUMBER
        # counter for the number of downloaded images
        self.image_max_counter = 0
        # counter used for directory names; it is incremented by 1 every one thousand images
        self.dir_counter = 0 
Example 16
def process_item(self, item, spider):
        if item is None:
            raise DropItem('Item is null')
        dir_path = self.make_dir()
        image_final_name = item['image_name'] + '-' + item['image_id'] + '-by@' + item['author'] + '.jpg'
        dest_path = os.path.join(dir_path, image_final_name)
        self.download_image(item['image_src'], dest_path)
        self.image_max_counter += 1
        if self.image_max_counter >= self.MAXIMUM_IMAGE_NUMBER:
            raise CloseSpider('Current downloaded image already equal maximum number')
        return item 
Example 17
def process_spider_exception(self, response, exception, spider):
        if isinstance(exception, HttpError):
            if response.status == 456:
                # response.meta['fool_blocked'] = True
                # return None
                raise CloseSpider('catch forbidden,close for a while')


# downloader middleware 
Example 18
def open_spider(self, spider):
        site_setting = spider.settings.get('SITE')
        if not site_setting:
            error_msg = 'Can not find the website configuration from settings.'
            spider.logger.error(error_msg)
            raise CloseSpider(error_msg)
        self.session = self.session_maker()
        site = self.session.query(LiveTVSite).filter(LiveTVSite.code == site_setting['code']).one_or_none()
        if not site:
            site = LiveTVSite(code=site_setting['code'], name=site_setting['name'],
                              description=site_setting['description'], url=site_setting['url'],
                              image=site_setting['image'], show_seq=site_setting['show_seq'])
            self.session.add(site)
            self.session.commit()
        self.site[site.code] = {'id': site.id, 'starttime': datetime.utcnow(), 'channels': {}} 
Example 19
def parse(self, response):
        # Homepage content
        html = response.body
        soup = BeautifulSoup(html,"lxml")
        # Crawl the news list on the homepage
        for i in self.fetch_newslist(soup):
            # raise CloseSpider(str(i['time'] == u"一天前"))
            # if i['time'] == "一天前": raise CloseSpider("today news end")
            request = scrapy.Request(i['news_url'],callback=self.parse_news)
            request.meta['item'] = i
            request.meta['pageindex'] = 1
            yield request

        # Crawl the link to the next page
        lasttime = "nothing"
        for i in soup.select('div[class="news_li"]'):
            if "lasttime" in i.attrs:
                lasttime =  i["lasttime"]
                break
        # Build the URL of the next page.
        # Format: load_chosen.jsp?nodeids=25949&topCids=1495258,1494171,1495064,1495130,1495285,&pageidx=
        load_chosen = re.search(r'data.:."(.*)".+.masonry',html)
        page = 2
        if load_chosen :
            tp_url = "http://www.thepaper.cn/load_chosen.jsp?%s%s&lastTime=%s" % (load_chosen.group(1),page,lasttime)
            yield scrapy.Request(tp_url, callback=self.next_page_parse) 
Example 20
def parse_news(self,response):
        #content,news_date,news_no,crawl_date,referer_web
        item = response.meta.get("item",NewsItem())
        pageindex = response.meta.get("pageindex",1)
        soup = BeautifulSoup(response.body)
        # news_date = item.get("news_date",None)
        # Need to crawl the exact publication time
        news_date = soup.find("span",class_="arial").text if soup.find("span",class_="arial") else None
        #http://info.meadin.com/PictureNews/2938_1.shtml Exception
        if news_date:

            # struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M:%S")
            # delta = self.end_now-struct_date
            # if delta.days == self.end_day:
            #     raise CloseSpider('today scrapy end')
            referer_web = list(soup.find("p",class_="source").strings)[-1] if soup.find("p",class_="source") else None
            # Crawl the article body
            art,content = None,None
            art = soup.find("div",class_="article js-article")
            if art:
                # Strip out the abstract!
                art.find("div",class_="intro").replace_with("")
                content = art.text.strip()
            news_no = response.url.split("/")[-1].split("_")[0]
            item["news_date"] = news_date
            item["content"] = content
            item["referer_web"] = referer_web
            item["crawl_date"] = NOW
            item["news_no"] = news_no
            item = judge_news_crawl(item)
            if item:
                yield item
            else:
                self.flag = pageindex
        else:
            logger.warning("can't find news_date.the url is %s" % response.url) 
Example 21
def __check_for_close(self):
        """
        Check to see if this spider has been running for longer than the maximum amount
        of allowed time, and stop the spider if it has.
        :return: None
        """
        if self._start_time is None:
            self._start_time = DatetimeHelper.now()
        elapsed_time = (DatetimeHelper.now() - self._start_time).total_seconds()
        if elapsed_time > self.max_run_time:
            raise CloseSpider(
                "Spider run time exceeded maximum time of %s seconds. Closing."
                % (self.max_run_time,)
            ) 
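A hedged sketch of the same time-boxing idea in a self-contained spider (the class name, URL and 300-second limit are assumptions; datetime.utcnow() stands in for the DatetimeHelper used above):

import datetime

import scrapy
from scrapy.exceptions import CloseSpider


class TimeboxedSpider(scrapy.Spider):
    # Hypothetical spider; closes itself once max_run_time seconds have elapsed.
    name = "timeboxed_example"
    start_urls = ["https://example.com"]  # assumed placeholder URL
    max_run_time = 300                    # assumed limit, in seconds

    _start_time = None

    def parse(self, response):
        if self._start_time is None:
            self._start_time = datetime.datetime.utcnow()
        elapsed = (datetime.datetime.utcnow() - self._start_time).total_seconds()
        if elapsed > self.max_run_time:
            raise CloseSpider("max run time of %s seconds exceeded" % self.max_run_time)
        yield {"url": response.url}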
Example 22
def build_check_recipient(self, ip, port, scheme,
                              user=None, password=None):
        """
        1. build a request for availability checking
        2. drop it if already existed

        :return: Request
        """

        if self.complete_condition():
            raise exceptions.CloseSpider('Enough items')

        spec = dict(ip=ip, port=port, scheme=scheme)

        if not valid_format(spec):
            self.logger.debug('Got wrong format (%s, %s). Clear it.' % (ip, port))

            return {}

        if self.already_exists(spec):
            self.logger.debug('Got duplicated %s. Clear it.' % spec.values())

            return {}  # drop it

        proxy_url = utils.build_proxy_url(ip, port, scheme, user, password)
        need_auth = int(bool(user and password))
        item = Proxy(
            ip=ip,
            scheme=scheme,
            port=port,
            need_auth=need_auth,
            url=proxy_url,
        )

        if need_auth:
            item['user'], item['password'] = user, password

        self.logger.debug('Got unchecked %s' % item)

        return self.build_check_request(item) 
Example 23
def process_item(self, item, spider):

        if not spider.postgres.checkConnection():
            raise CloseSpider("Unable to Establish a Database Connection")
        
        if spider.postgres.checkUrlExists(item['link']):
            raise DropItem("Url " + item['link'] + " Exists in Database")
        
        return item 
Example 24
def parse_news(self,response):
        # print response.url,"response"
        PageKey = response.meta.get("topic_id")
        PageNumber = response.meta.get("PageNumber")
        flag_id = str(int(PageKey)-40037910)
        soup = BeautifulSoup(response.body, "lxml")
        #2016-07-13
        news_date = soup.find("time").text if soup.find("time") else None
        # print self.flag[flag_id],int(PageNumber)
        """
        The condition: keep crawling while this category's flag (self.flag[flag_id]) is 0,
        which means no expired page has been reached yet. Pages equal to the flagged page
        are still crawled, because requests within a page are handled asynchronously.
        self.flag[flag_id] = the page number at which entries expired
        """
        if not self.flag[flag_id] or int(PageNumber)==self.flag[flag_id]:
            # still within the crawl range


            struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d")
            # print self.end_now,struct_date,"time"
            delta = self.end_now-struct_date
            # print delta.days,"delta day ~~~~~~~~~~~~~~~~"
            if delta.days > self.end_day:
                self.flag[str(flag_id)] = int(PageNumber)
                # print flag_id,"stop ~~~~~~"
                # raise CloseSpider('today scrapy end')
            else:

                head = soup.find("div",class_="post-head")
                topic, title, abstract = None, None, None
                if head:
                    topic = head.find("span",class_="category").text if head.find("span",class_="category") else None
                    title = head.find("h1",class_="h1").text if head.find("h1",class_="h1") else None
                    abstract = head.find("span",class_="kicker").text if head.find("span",class_="kicker") else None
                content = soup.find("div",class_="post-body clearfix").text if soup.find("div",class_="post-body clearfix") else None
                news_no = response.url.split("/")[-1].split("?")[0]
                # TODO: the comment count is rendered by JS; not handled yet
                item = NewsItem(title=title,topic=topic,
                                abstract=abstract,news_date=news_date,
                                content=content,news_no=news_no
                                ,crawl_date=NOW,news_url=response.url,catalogue='新闻板块')
                yield item