Python源码示例:scrapy.exceptions.DropItem()

示例1
def process_item(self, item, spider):
    """De-duplicate meituan restaurants and enrich coordinates/region codes.

    Items from other spiders pass through untouched. Duplicates (same
    restaurant name mapping to the same address) are dropped.

    Raises:
        DropItem: when the (restaurant_name, address) pair was seen before.
    """
    if spider.name not in ['meituan']:
        return item
    if self.filter_dic.get(item['restaurant_name']) == item['address']:
        print(item['restaurant_name'])
        print(item['address'])
        raise DropItem("Duplicate item found: %s" % item)
    else:
        self.filter_dic[item['restaurant_name']] = item['address']
        try:
            # Convert GaoDe (AMap) coordinates to Baidu's system and derive
            # pinyin codes for the administrative divisions.
            item['lng'], item['lat'] = gaode_to_baidu(float(item['lng']), float(item['lat']))
            item['province_code'] = pinyin.get(item['province'])
            item['city_code'] = pinyin.get(item['city'])
            item['region_code'] = pinyin.get(item['region'])
            item['area_code'] = pinyin.get(item['area'])
        except Exception as e:
            # Bug fix: was `except BaseException`, which also swallowed
            # KeyboardInterrupt/SystemExit; best-effort enrichment only
            # needs to tolerate ordinary errors.
            print(e)
        return item
示例2
def item_completed(self, results, item, info):
    """Collect per-URL download outcomes, then defer to the parent pipeline.

    Successful downloads map source url -> stored path; failures map the
    originally requested url -> the failure's error message.
    """
    outcome = {}
    for idx, (ok, payload) in enumerate(results):
        if ok:
            outcome[payload["url"]] = payload["path"]
        else:
            outcome[item[self.URLS_NAME][idx]] = payload.getErrorMessage()
    # TODO: Save the result (``outcome`` is currently unused)

    return super(GroupDownPipelineMinix, self).item_completed(results, item, info)
示例3
def process_item(self, item, spider):
    """Validate *item* against its JSON schema and drop it on any violation.

    Non-JsonSchemaItem items pass through untouched. A stats counter is
    incremented per failing field path.
    """
    if not isinstance(item, JsonSchemaItem):
        return item

    paths_messages = []
    for error in list(item.validator.iter_errors(dict(item))):
        path_parts = list(error.absolute_path)
        # error path is not available when required field is not filled
        # so we parse error message. Nasty.
        match = self.REQUIRED_RE.search(error.message)
        if match:
            path_parts.append(match.group(1))
        dotted = '.'.join(map(str, path_parts))
        self.stats.inc_value(self.STAT_FMT.format(field=dotted))
        paths_messages.append((dotted, error.message))

    if paths_messages:
        error_msg = ''.join(u'{}: {}\n'.format(p, m) for p, m in paths_messages)
        raise DropItem(u'schema validation failed: \n {}'.format(error_msg))

    return item
示例4
def _itemproc_finished(self, output, item, response, spider):
        """ItemProcessor finished for the given ``item`` and returned ``output``

        Dispatches exactly one of three signals depending on the outcome:
        ``item_dropped`` when a pipeline raised ``DropItem``, ``item_error``
        for any other failure, ``item_scraped`` on success.
        """
        self.slot.itemproc_size -= 1
        if isinstance(output, Failure):
            ex = output.value
            if isinstance(ex, DropItem):
                # Expected control flow: a pipeline rejected the item.
                logkws = self.logformatter.dropped(item, ex, response, spider)
                logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
                return self.signals.send_catch_log_deferred(
                    signal=signals.item_dropped, item=item, response=response,
                    spider=spider, exception=output.value)
            else:
                # Unexpected failure: log with full traceback information.
                logger.error('Error processing %(item)s', {'item': item},
                             exc_info=failure_to_exc_info(output),
                             extra={'spider': spider})
                return self.signals.send_catch_log_deferred(
                    signal=signals.item_error, item=item, response=response,
                    spider=spider, failure=output)
        else:
            # Success: ``output`` is the (possibly transformed) item.
            logkws = self.logformatter.scraped(output, response, spider)
            logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_scraped, item=output, response=response,
                spider=spider)
示例5
def _itemproc_finished(self, output, item, response, spider):
    """ItemProcessor finished for the given ``item`` and returned ``output``

    Fires ``item_scraped`` on success, ``item_dropped`` when a pipeline
    raised ``DropItem``, and ``item_error`` for any other failure.
    """
    self.slot.itemproc_size -= 1

    # Success path first: output is the (possibly transformed) item.
    if not isinstance(output, Failure):
        logkws = self.logformatter.scraped(output, response, spider)
        logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
        return self.signals.send_catch_log_deferred(
            signal=signals.item_scraped, item=output, response=response,
            spider=spider)

    exc = output.value
    if isinstance(exc, DropItem):
        # A pipeline explicitly rejected the item.
        logkws = self.logformatter.dropped(item, exc, response, spider)
        logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
        return self.signals.send_catch_log_deferred(
            signal=signals.item_dropped, item=item, response=response,
            spider=spider, exception=exc)

    # Anything else is an unexpected error: log with traceback info.
    logger.error('Error processing %(item)s', {'item': item},
                 exc_info=failure_to_exc_info(output),
                 extra={'spider': spider})
    return self.signals.send_catch_log_deferred(
        signal=signals.item_error, item=item, response=response,
        spider=spider, failure=output)
示例6
def process_item(self, item, spider):
        """Normalize an event item and drop those outside the configured window.

        Raises:
            DropItem: when the spider's custom filter rejects the item, or
                the event's time range is outside the spider's window.
        """
        item['organization'] = spider.organization
        if 'event_time' in item:
            # Carry the spider's date format along so the loader can parse times.
            item['event_time']['date_format'] = spider.date_format
        loader = EventLoader(**item)
        # see if there is a custom filter for the item
        if not spider.item_filter(item):
            raise DropItem('Custom item filter did not allow this event')
        if 'event_time' in loader.item:
            time = loader.item['event_time']
            # Keep only events overlapping the spider's configured timeframe.
            if self.time_utils.time_range_is_between(time['start_timestamp'], time['end_timestamp'], spider.start_timestamp, spider.end_timestamp):
                return loader.item
            else:
                raise DropItem('Event is not in the configured timeframe')
        else:
            # No time information: pass the normalized item through.
            return loader.item
示例7
def process_exception(self, request, exception, spider):
    """Record timing and error metrics for failed AutoExtract requests.

    IgnoreRequest/DropItem and requests not handled by AutoExtract are
    passed through untouched.
    """
    if isinstance(exception, (IgnoreRequest, DropItem)):
        return
    if not self._is_enabled_for_request(request):
        return

    autoextract = request.meta.pop(AUTOEXTRACT_META_KEY)
    stop_time = time.time()
    # Bug fix: latency previously called time.time() a second time, so it
    # could drift from end_ts; derive it from the same stop_time instead.
    latency = stop_time - autoextract['timing']['start_ts']
    autoextract['timing'].update({'end_ts': stop_time, 'latency': latency})

    # Make sure to log all unknown failures
    logger.warning('AutoExtract failure after %.3fs for %s: %s',
                   latency,
                   autoextract['original_url'],
                   repr(exception),
                   extra={'spider': spider})

    request.meta['autoextract'] = autoextract
    ex_class = global_object_name(exception.__class__)
    self.inc_metric('autoextract/errors/total_count', spider=spider)
    self.inc_metric('autoextract/errors/type_count/%s' % ex_class, spider=spider)
示例8
def process_item(self, item, spider):
        """Skip articles already stored recently (RSS/GDELT crawlers only).

        Raises:
            DropItem: when a version of the article already exists in the DB
                and was downloaded less than ``self.delta_time`` hours ago.
        """
        if spider.name in ['RssCrawler', 'GdeltCrawler']:
            # Search the CurrentVersion table for a version of the article
            try:
                self.cursor.execute(self.compare_versions, (item['url'],))
            except (pymysql.err.OperationalError, pymysql.ProgrammingError, pymysql.InternalError,
                    pymysql.IntegrityError, TypeError) as error:
                # NOTE(review): execution continues after a failed query, so
                # fetchone() below may return stale data — confirm intended.
                self.log.error("Something went wrong in rss query: %s", error)

            # Save the result of the query. Must be done before the add,
            #   otherwise the result will be overwritten in the buffer
            old_version = self.cursor.fetchone()

            # NOTE(review): "%y" parses a two-digit year — verify the format
            # actually stored in item['download_date'].
            if old_version is not None and (datetime.datetime.strptime(
                    item['download_date'], "%y-%m-%d %H:%M:%S") -
                                            old_version[3]) \
                    < datetime.timedelta(hours=self.delta_time):
                # Compare the two download dates. index 3 of old_version
                # corresponds to the download_date attribute in the DB
                raise DropItem("Article in DB too recent. Not saving.")

        return item
示例9
def process_item(self, item, spider):
    """Drop any item that is missing a required field.

    All items need ``id``, ``title`` and ``link``; FeedEntryItem instances
    additionally need ``updated``.
    """
    def ensure_present(field, obj):
        if field not in obj:
            raise DropItem(
                'The required field "{}" is missing in: {}.'.format(field, obj)
            )

    # Required fields for all items
    for field in ("id", "title", "link"):
        ensure_present(field, item)

    # Required fields for FeedEntryItems
    if isinstance(item, FeedEntryItem):
        ensure_present("updated", item)

    return item
示例10
def parse(self, response):
    """
    Default callback function with response for the crawled url
    https://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.parse

    Converts <br> tags to newlines, extracts the property key from the URL
    query, and yields a Property item; drops pages with no data.
    """
    # Bug fix: 'utf=8' is not a registered codec name and raised
    # LookupError on every response; the intended encoding is 'utf-8'.
    response = response.replace(body=re.sub(r"<br\s*[\/]?>", "\n", response.body.decode('utf-8')))
    property_key = response.url.split('=')[1].replace('&', '')
    if 'No Data at this time' in response.text:
        msg = "No data for " + response.url
        logging.warning(msg)
        raise DropItem(msg)
    else:
        property_info = self.parse_property_info(response)
        property_values = self.parse_property_values(response)
        property_sales = self.parse_property_sales(response)
        property_info['sales'] = property_sales
        property_info['values'] = property_values
        property_info['property_key'] = property_key
        yield Property(property_info)
示例11
def process_item(self, item, spider):
    """Main function that process URL item (first phase).

    Truncates over-long raw URLs, validates and resolves the site, then
    inserts the URL row into the database.

    Raises:
        DropItem: for unparsable URLs, offsite domains, or DB insert failure.
    """
    # validate URL length
    if len(item['raw']) > MAX_URL_LEN:
        item['raw'] = item['raw'][:MAX_URL_LEN]
        logger.error('Raw URL too long, trucate it! %r', item['raw'])
    # parse raw URL
    purl = get_parsed_url(item['raw'])
    if purl is None or purl.hostname is None:
        raise DropItem('Invalid URL')
    site_id = belongs_to_site(purl.hostname, self.site_tuples)
    if site_id is None:
        # Bug fix: DropItem does not %-interpolate extra args the way
        # logging does; format the message explicitly.
        raise DropItem('Offsite domain: %s' % item)
    item['site_id'] = site_id
    # insert URL into table
    try:
        get_or_create_murl(spider.session, item, spider.platform_id)
    except SQLAlchemyError as e:
        logger.error(e)
        spider.session.rollback()
        raise DropItem('Fail to insert database of url: %s' % item)
    return item
示例12
def checkInvalidKeys(self, item):
    """ Checks Keys For Invalid Entries Such as None/Empty """
    # Fields allowed to be None / empty without complaint.
    allowedKeys = {
        'None': ["image"],
        'Empty': ["image"]
    }
    for key in item:
        try:
            value = item[key]
            # Idiom fixes: `is None` instead of `== None`, isinstance
            # instead of a `type(...) is str` comparison.
            if (value is None or value == "Error") and key not in allowedKeys['None']:
                raise DropItem("Required Key " + str(key) + " is None")

            if isinstance(value, str) and key not in allowedKeys['Empty'] and len(value) == 0:
                raise DropItem("Required Key " + str(key) + " is Empty")
        except DropItem:
            # NOTE(review): deliberately swallows DropItem, so invalid keys
            # are detected but the item is never actually dropped — confirm
            # this is intended before relying on this check.
            pass
        except Exception as e:
            logger.error(__name__ + " Exception: " + str(e))
            continue
示例13
def _process_verified_item(self, item, spider):
        if item['incr'] == '-inf' or item['incr'] < 0:
            raise DropItem('item verification has failed')

        self.redis_con.zadd(item['queue'], item['verified_time'], item['url']) 
示例14
def _process_speed_item(self, item, spider):
        if item['incr'] == '-inf' or item['incr'] < 0:
            raise DropItem('item verification has failed')

        self.redis_con.zadd(item['queue'], item['response_time'], item['url']) 
示例15
def process_item(self, item, spider):
    """Drop level-2 items whose set name was not extracted."""
    missing_set_name = item['level'] == 2 and item['set_name'] is None
    if missing_set_name:
        raise DropItem('set name is empty')
    return item
示例16
def process_item(self, item, spider):
    """Persist the item (insert for level 1, update for level 2); drop on failure."""
    level = item['level']
    if level == 1:
        if self.__insert_vedio(item):
            return item
    elif level == 2:
        if self.__update_vedio(item):
            return item
    raise DropItem('fail to store data')
示例17
def get_media_requests(self, item, info):
    """Build download Requests for an item's firmware/MIB URLs.

    Raises:
        DropItem: when a mandatory field is missing or the URL path is a
            filtered document/extension type.
    """
    # check for mandatory fields
    for x in ["vendor", "url"]:
        if x not in item:
            # Bug fix: the format string had one %s but two arguments,
            # which raised TypeError instead of the intended DropItem.
            raise DropItem(
                "Missing required field '%s' for item: %s" % (x, item))

    # resolve dynamic redirects in urls
    for x in ["mib", "sdk", "url"]:
        if x in item:
            split = urlparse.urlsplit(item[x])
            # remove username/password if only one provided
            # Bug fix: `and` bound tighter than `or`, so a URL with only a
            # username was never stripped; parenthesize the intended logic.
            if (split.username or split.password) and not (split.username and split.password):
                item[x] = urlparse.urlunsplit(
                    (split[0], split[1][split[1].find("@") + 1:], split[2], split[3], split[4]))

            if split.scheme == "http":
                item[x] = urllib.urlopen(item[x]).geturl()

    # check for filtered url types in path
    url = urlparse.urlparse(item["url"])
    if any(url.path.endswith(x) for x in [".pdf", ".php", ".txt", ".doc", ".rtf", ".docx", ".htm", ".html", ".md5", ".sha1", ".torrent"]):
        raise DropItem("Filtered path extension: %s" % url.path)
    elif any(x in url.path for x in ["driver", "utility", "install", "wizard", "gpl", "login"]):
        raise DropItem("Filtered path type: %s" % url.path)

    # generate list of url's to download
    item[self.files_urls_field] = [item[x]
                                   for x in ["mib", "url"] if x in item]

    # pass vendor so we can generate the correct file path and name
    return [Request(x, meta={"ftp_user": "anonymous", "ftp_password": "chrome@example.com", "vendor": item["vendor"]}) for x in item[self.files_urls_field]]

    # overrides function from FilesPipeline 
示例18
def _drop_item(self, item, errors):
        """
        This method drops the item after detecting validation errors. Note
        that you could override it to add more details about the item that
        is being dropped or to drop the item only when some specific errors
        are detected.
        """
        self.stats.add_dropped_item()
        raise DropItem("Validation failed!") 
示例19
def process_item(self, item, spider):
        """De-duplicate business/review/user items by their ``<type>_id``.

        NOTE(review): items that are none of the three known types fall
        through and return ``None`` implicitly, which Scrapy treats as a
        drop — confirm that is intended.
        """
        if (isinstance(item, BusinessItem) or isinstance(item, ReviewItem)
                or isinstance(item, UserItem)):
            what = item_type(item)
            # seen_items maps item type -> set of ids already processed
            key = '{0}_id'.format(what)
            value = item[key]
            if value in self.seen_items[what]:
                raise DropItem('Duplicate {0} found: {1}'.format(what, item))
            else:
                self.seen_items[what].add(value)
                return item


# Shamelessly copied from http://stackoverflow.com/q/12230332 
示例20
def process_item(self, item, spider):
    """De-duplicate items by their ``pid`` field."""
    pid = item['pid']
    if pid in self.ids_seen:
        raise DropItem("Duplicate item found: %s" % item)
    self.ids_seen.add(pid)
    return item
示例21
def process_item(self, item, spider):
    """Validate scraped articles and drop invalid or no-reproduction ones.

    Raises:
        DropItem: when any required field is empty, or the content forbids
            reproduction of GitHub-hosted sources.
    """
    if item['title'] and item['content'] and item['url'] and item['name'] and item['req_url']:
        if 'github' in item['url'] and ',禁止转载' in item['content']:
            # Bug fix: the original *returned* the DropItem instance, which
            # Scrapy would pass down the pipeline as the item; it must be
            # raised to actually drop the item.
            raise DropItem(f"Data not allowed`{item['title']}")
        return item
    raise DropItem(f"Data not valid`{item['title']}")
示例22
def process_item(self, item, spider):
    """Drop items whose description contains a forbidden word.

    Raises:
        DropItem: when any configured word appears in the description.
    """
    if self.words_to_filter:
        # Hoisted out of the loop: the original re-computed
        # unicode(...).lower() once per filter word.
        description = unicode(item['description']).lower()
        for word in self.words_to_filter:
            if word in description:
                raise DropItem("Contains forbidden word: %s" % word)
    return item
示例23
def item_completed(self, results, item, info):
    """Attach the paths of successfully downloaded images; drop items with none."""
    image_paths = []
    for ok, payload in results:
        if ok:
            image_paths.append(payload['path'])
    if not image_paths:
        raise DropItem("Item contains no images")
    item['image_paths'] = image_paths
    return item
示例24
def process_item(self, item, spider):
    """De-duplicate fetched articles per platform before storage.

    WeChat articles are keyed by a title fingerprint (their links carry
    expiring signatures), Weibo articles by a URL fingerprint.

    Raises:
        DropItem: when a matching fingerprint already exists in FetchResult.
    """
    session = db_session_mysql()
    try:
        if isinstance(item, FetchResultItem):
            if spider.name == 'weixin':
                # 标题(微信只能通过标题去重, 因为链接带过期签名)
                article_id_count = session.query(FetchResult) \
                    .filter(FetchResult.platform_id == WEIXIN,
                            FetchResult.article_id == get_finger(item['article_title'])) \
                    .count()
                if article_id_count:
                    raise DropItem(
                        '%s Has been duplication of article_title: %s' % (spider.name, item['article_title']))

            if spider.name == 'weibo':
                # 详细链接(微博可以直接通过链接去重)
                article_url_count = session.query(FetchResult) \
                    .filter(FetchResult.platform_id == WEIBO,
                            FetchResult.article_id == get_finger(item['article_url'])) \
                    .count()
                if article_url_count:
                    raise DropItem(
                        '%s Has been duplication of article_url: %s' % (spider.name, item['article_url']))

        return item
    finally:
        # Removed the pointless `except Exception as e: raise e`, which only
        # re-raised without handling; finally still closes the session on
        # every path.
        session.close()
示例25
def process_item(self, item, spider):
    """Validate that every field has a value, then store the item in Mongo.

    Raises:
        DropItem: when any field value in the item is empty/falsy.
    """
    # Bug fix: the original iterated keys and tested the key itself
    # (`if not data`), which is never falsy for a named field, so the
    # validation could never fire; test the field *values* instead.
    # The dead `valid` flag is gone: raise exits immediately anyway.
    for field in item:
        if not item[field]:
            raise DropItem("丢失%s期开奖数据" % item['title'])

    self.collection.insert(dict(item))
    return item
示例26
def _get_stats_for_docs(self, docs, valid, item_cls=TestItem):
        """Run each doc through a fresh JsonSchemaValidatePipeline and return its stats.

        Args:
            docs: iterable of dicts used to construct items of ``item_cls``.
            valid: when True, each item is expected to pass validation;
                when False, process_item must raise DropItem.
            item_cls: item class instantiated per doc (defaults to TestItem).

        Returns:
            The DummyStatsCollector populated by the pipeline run.
        """
        stats = DummyStatsCollector()
        pipeline = JsonSchemaValidatePipeline(stats)

        for doc in docs:
            item = item_cls(**doc)
            if valid:
                pipeline.process_item(item, None)
            else:
                self.assertRaises(DropItem, pipeline.process_item, item, None)

        return stats
示例27
def process_item(self, item, spider):
    """Download the item's image to disk and stop the spider at the cap.

    Raises:
        DropItem: when the item is None.
        CloseSpider: once the configured maximum image count is reached.
    """
    if item is None:
        raise DropItem('Item is null')
    target_dir = self.make_dir()
    file_name = item['image_name'] + '-' + item['image_id'] + '-by@' + item['author'] + '.jpg'
    self.download_image(item['image_src'], os.path.join(target_dir, file_name))
    self.image_max_counter += 1
    if self.image_max_counter >= self.MAXIMUM_IMAGE_NUMBER:
        raise CloseSpider('Current downloaded image already equal maximum number')
    return item
示例28
def download_image(self, src, dest):
    """Fetch *src* over HTTP and write the response bytes to *dest*.

    Raises:
        DropItem: when the server replies with a non-200 status code.
    """
    thread_name = threading.current_thread().name
    print('[Thread %s] preparing download image.....' % thread_name)
    response = requests.get(src, timeout=2)
    if response.status_code != 200:
        raise DropItem('[Thread %s] request image src failed status code = %s'
                       % (thread_name, response.status_code))
    with open(dest, 'wb') as f:
        f.write(response.content)
        print('[DOWNLOAD FINISHED] from %s to %s ' % (src, dest))
示例29
def process_item(self, item, spider):
    """Validate field values, store the item in MongoDB, and pass it along.

    Raises:
        DropItem: when any field value in the item is empty/falsy.
    """
    # Bug fix: the original iterated keys and tested the key itself
    # (`if not data`), which is never falsy for a named field, so the
    # validation could never fire; test the field *values* instead.
    # The dead `valid` flag is gone: raise exits immediately anyway.
    for field in item:
        if not item[field]:
            raise DropItem("Missing {0}!".format(field))

    object_id = self.news_collection.insert(dict(item))
    # Expose the Mongo id to the spider for later reference.
    spider.object_id = str(object_id)
    logger.info("Question added to MongoDB database!")
    return item
示例30
def process_item(self, item, spider):
    """Drop quotes whose content already exists in the database.

    Raises:
        DropItem: when a Quote row with the same quote_content exists.
    """
    session = self.Session()
    try:
        exist_quote = session.query(Quote).filter_by(quote_content=item["quote_content"]).first()
        if exist_quote is not None:  # the current quote exists
            raise DropItem("Duplicate item found: %s" % item["quote_content"])
        return item
    finally:
        # Bug fix: the original placed session.close() *after* the raise and
        # after the return, so it was unreachable on both paths and the
        # session leaked; finally closes it on every path.
        session.close()