Python source-code examples for scrapy.exceptions.DropItem()
Example 1
def process_item(self, item, spider):
    """Deduplicate Meituan restaurants and normalize coordinates/codes.

    Items from spiders other than 'meituan' pass through untouched.
    A duplicate (restaurant_name, address) pair is dropped; otherwise the
    pair is recorded, coordinates are converted via ``gaode_to_baidu`` and
    pinyin codes are derived for the location fields.

    :raises DropItem: when the (name, address) pair was already seen.
    """
    if spider.name not in ['meituan']:
        return item
    if self.filter_dic.get(item['restaurant_name']) == item['address']:
        print(item['restaurant_name'])
        print(item['address'])
        raise DropItem("Duplicate item found: %s" % item)
    else:
        self.filter_dic[item['restaurant_name']] = item['address']
    try:
        item['lng'], item['lat'] = gaode_to_baidu(float(item['lng']), float(item['lat']))
        item['province_code'] = pinyin.get(item['province'])
        item['city_code'] = pinyin.get(item['city'])
        item['region_code'] = pinyin.get(item['region'])
        item['area_code'] = pinyin.get(item['area'])
    # BUGFIX: catch Exception, not BaseException — the original also
    # swallowed KeyboardInterrupt/SystemExit, making the spider unkillable
    # inside this block.
    except Exception as e:
        print(e)
    return item
示例2
def item_completed(self, results, item, info):
    """Collect per-URL download outcomes, then defer to the parent pipeline.

    Successful downloads map url -> storage path; failed ones map the
    originally requested url -> error message.
    """
    outcome = {}
    for index, (success, payload) in enumerate(results):
        if success:
            outcome[payload["url"]] = payload["path"]
        else:
            outcome[item[self.URLS_NAME][index]] = payload.getErrorMessage()
    # TODO: Save the result
    # file_paths = [x['path'] for ok, x in results if ok]
    # if not file_paths:
    # raise DropItem("Item contains no files")
    # item['image_paths'] = file_paths
    # return item
    return super(GroupDownPipelineMinix, self).item_completed(results, item, info)
示例3
def process_item(self, item, spider):
    """Validate a JsonSchemaItem against its JSON schema; drop on errors.

    Non-schema items pass straight through.  Each violation increments a
    stats counter keyed by the offending field path.
    """
    if not isinstance(item, JsonSchemaItem):
        return item
    errors = list(item.validator.iter_errors(dict(item)))
    paths_messages = []
    for error in errors:
        path_parts = list(error.absolute_path)
        # The error path is empty when a required field is missing, so the
        # field name is recovered from the error message instead. Nasty.
        match = self.REQUIRED_RE.search(error.message)
        if match:
            path_parts.append(match.group(1))
        dotted = '.'.join(str(part) for part in path_parts)
        self.stats.inc_value(self.STAT_FMT.format(field=dotted))
        paths_messages.append((dotted, error.message))
    if errors:
        details = u''.join(u'{}: {}\n'.format(p, m) for p, m in paths_messages)
        raise DropItem(u'schema validation failed: \n {}'.format(details))
    return item
示例4
def _itemproc_finished(self, output, item, response, spider):
    """ItemProcessor finished for the given ``item`` and returned ``output``
    """
    self.slot.itemproc_size -= 1
    if isinstance(output, Failure):
        ex = output.value
        if isinstance(ex, DropItem):
            # Expected drop: log at the formatter-chosen level and fire
            # the item_dropped signal carrying the DropItem exception.
            logkws = self.logformatter.dropped(item, ex, response, spider)
            logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_dropped, item=item, response=response,
                spider=spider, exception=output.value)
        else:
            # Unexpected pipeline error: log with traceback and fire
            # item_error with the original Failure.
            logger.error('Error processing %(item)s', {'item': item},
                         exc_info=failure_to_exc_info(output),
                         extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_error, item=item, response=response,
                spider=spider, failure=output)
    else:
        # Success: ``output`` is the (possibly replaced) item.
        logkws = self.logformatter.scraped(output, response, spider)
        logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
        return self.signals.send_catch_log_deferred(
            signal=signals.item_scraped, item=output, response=response,
            spider=spider)
示例5
def _itemproc_finished(self, output, item, response, spider):
    """ItemProcessor finished for the given ``item`` and returned ``output``
    """
    self.slot.itemproc_size -= 1
    if isinstance(output, Failure):
        ex = output.value
        if isinstance(ex, DropItem):
            # Expected drop: log at the formatter-chosen level and fire
            # the item_dropped signal carrying the DropItem exception.
            logkws = self.logformatter.dropped(item, ex, response, spider)
            logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_dropped, item=item, response=response,
                spider=spider, exception=output.value)
        else:
            # Unexpected pipeline error: log with traceback and fire
            # item_error with the original Failure.
            logger.error('Error processing %(item)s', {'item': item},
                         exc_info=failure_to_exc_info(output),
                         extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_error, item=item, response=response,
                spider=spider, failure=output)
    else:
        # Success: ``output`` is the (possibly replaced) item.
        logkws = self.logformatter.scraped(output, response, spider)
        logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
        return self.signals.send_catch_log_deferred(
            signal=signals.item_scraped, item=output, response=response,
            spider=spider)
示例6
def process_item(self, item, spider):
    """Load an event item, apply the spider's filter, and enforce the
    configured time window.  Raises DropItem when filtered or outside
    the window."""
    # Attach spider-level metadata before loading.
    item['organization'] = spider.organization
    if 'event_time' in item:
        item['event_time']['date_format'] = spider.date_format
    loader = EventLoader(**item)
    # see if there is a custom filter for the item
    if not spider.item_filter(item):
        raise DropItem('Custom item filter did not allow this event')
    if 'event_time' in loader.item:
        time = loader.item['event_time']
        # Keep only events overlapping the spider's configured timeframe.
        if self.time_utils.time_range_is_between(time['start_timestamp'], time['end_timestamp'], spider.start_timestamp, spider.end_timestamp):
            return loader.item
        else:
            raise DropItem('Event is not in the configured timeframe')
    else:
        return loader.item
示例7
def process_exception(self, request, exception, spider):
    """Record timing and error metrics for unexpected AutoExtract failures.

    DropItem/IgnoreRequest are normal control flow and are ignored, as are
    requests this middleware is not enabled for.
    """
    if isinstance(exception, (IgnoreRequest, DropItem)):
        return
    if not self._is_enabled_for_request(request):
        return
    autoextract = request.meta.pop(AUTOEXTRACT_META_KEY)
    stop_time = time.time()
    # BUGFIX: use the same timestamp for end_ts and latency — the original
    # sampled time.time() twice, so the two values could disagree.
    latency = stop_time - autoextract['timing']['start_ts']
    autoextract['timing'].update({'end_ts': stop_time, 'latency': latency})
    # Make sure to log all unknown failures
    logger.warning('AutoExtract failure after %.3fs for %s: %s',
                   latency,
                   autoextract['original_url'],
                   repr(exception),
                   extra={'spider': spider})
    request.meta['autoextract'] = autoextract
    ex_class = global_object_name(exception.__class__)
    self.inc_metric('autoextract/errors/total_count', spider=spider)
    self.inc_metric('autoextract/errors/type_count/%s' % ex_class, spider=spider)
示例8
def process_item(self, item, spider):
    """Drop articles that already have a sufficiently recent version in
    the database; all other items pass through unchanged."""
    if spider.name in ['RssCrawler', 'GdeltCrawler']:
        # Search the CurrentVersion table for a version of the article
        try:
            self.cursor.execute(self.compare_versions, (item['url'],))
        except (pymysql.err.OperationalError, pymysql.ProgrammingError, pymysql.InternalError,
                pymysql.IntegrityError, TypeError) as error:
            self.log.error("Something went wrong in rss query: %s", error)
        # Save the result of the query. Must be done before the add,
        # otherwise the result will be overwritten in the buffer
        old_version = self.cursor.fetchone()
        # NOTE(review): "%y" parses a two-digit year — confirm that
        # download_date is really formatted that way.
        if old_version is not None and (datetime.datetime.strptime(
                item['download_date'], "%y-%m-%d %H:%M:%S") -
                old_version[3]) \
                < datetime.timedelta(hours=self.delta_time):
            # Compare the two download dates. index 3 of old_version
            # corresponds to the download_date attribute in the DB
            raise DropItem("Article in DB too recent. Not saving.")
    return item
示例9
def process_item(self, item, spider):
    """Ensure mandatory fields are present, dropping incomplete items."""
    def raise_if_missing(name, item):
        if name not in item:
            raise DropItem(
                'The required field "{}" is missing in: {}.'.format(name, item)
            )
    # All items need id/title/link; FeedEntryItems additionally need
    # "updated".
    required = ["id", "title", "link"]
    if isinstance(item, FeedEntryItem):
        required.append("updated")
    for name in required:
        raise_if_missing(name, item)
    return item
示例10
def parse(self, response):
    """
    Default callback function with response for the crawled url
    https://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.parse
    """
    # BUGFIX: the codec name was misspelled 'utf=8'; it only worked by
    # accident of Python's codec-name normalization. Use 'utf-8'.
    response = response.replace(body=re.sub(r"<br\s*[\/]?>", "\n", response.body.decode('utf-8')))
    property_key = response.url.split('=')[1].replace('&', '')
    # logging.debug("Parsing property_key: %s", property_key)
    if 'No Data at this time' in response.text:
        msg = "No data for " + response.url
        logging.warning(msg)
        raise DropItem(msg)
    else:
        property_info = self.parse_property_info(response)
        property_values = self.parse_property_values(response)
        property_sales = self.parse_property_sales(response)
        property_info['sales'] = property_sales
        property_info['values'] = property_values
        property_info['property_key'] = property_key
        yield Property(property_info)
示例11
def process_item(self, item, spider):
    """Main function that process URL item (first phase).

    Truncates over-long raw URLs, drops unparsable or off-site URLs, and
    inserts the URL into the database (dropping the item on DB failure).
    """
    # validate URL length
    if len(item['raw']) > MAX_URL_LEN:
        item['raw'] = item['raw'][:MAX_URL_LEN]
        logger.error('Raw URL too long, truncate it! %r', item['raw'])
    # parse raw URL
    purl = get_parsed_url(item['raw'])
    if purl is None or purl.hostname is None:
        raise DropItem('Invalid URL')
    site_id = belongs_to_site(purl.hostname, self.site_tuples)
    if site_id is None:
        # BUGFIX: DropItem does not lazy-format like logger calls do; the
        # original passed the item as a second argument and the message
        # never included it. Interpolate explicitly.
        raise DropItem('Offsite domain: %s' % item)
    item['site_id'] = site_id
    # insert URL into table
    try:
        get_or_create_murl(spider.session, item, spider.platform_id)
    except SQLAlchemyError as e:
        logger.error(e)
        spider.session.rollback()
        raise DropItem('Fail to insert database of url: %s' % item)
    return item
示例12
def checkInvalidKeys(self, item):
    """Checks keys for invalid entries such as None/empty values.

    Keys listed in ``allowedKeys`` may legitimately be None or empty.
    NOTE(review): every DropItem raised here is immediately caught and
    discarded by the ``except DropItem: pass`` below, so this method only
    ever logs unexpected errors — confirm whether the drop was meant to
    propagate.
    """
    allowedKeys = {
        'None': ["image"],
        'Empty': ["image"]
    }
    for key in item:
        try:
            # Use ``is None`` (identity) rather than ``== None``, and
            # isinstance() rather than a type() comparison.
            if (item[key] is None or item[key] == "Error") and key not in allowedKeys['None']:
                raise DropItem("Required Key " + str(key) + " is None")
            if isinstance(item[key], str) and key not in allowedKeys['Empty']:
                if len(item[key]) == 0:
                    raise DropItem("Required Key " + str(key) + " is Empty")
        except DropItem:
            pass
        except Exception as e:
            logger.error(__name__ + " Exception: " + str(e))
            continue
示例13
def _process_verified_item(self, item, spider):
if item['incr'] == '-inf' or item['incr'] < 0:
raise DropItem('item verification has failed')
self.redis_con.zadd(item['queue'], item['verified_time'], item['url'])
示例14
def _process_speed_item(self, item, spider):
if item['incr'] == '-inf' or item['incr'] < 0:
raise DropItem('item verification has failed')
self.redis_con.zadd(item['queue'], item['response_time'], item['url'])
示例15
def process_item(self, item, spider):
    """Drop level-2 items that have no set name; pass everything else."""
    missing_set_name = item['level'] == 2 and item['set_name'] is None
    if missing_set_name:
        raise DropItem('set name is empty')
    return item
示例16
def process_item(self, item, spider):
    """Persist the item (insert for level 1, update for level 2); drop on failure."""
    level = item['level']
    if level == 1 and self.__insert_vedio(item):
        return item
    if level == 2 and self.__update_vedio(item):
        return item
    raise DropItem('fail to store data')
示例17
def get_media_requests(self, item, info):
    """Build download Requests for an item's downloadable URLs.

    Validates mandatory fields, strips lone credentials / resolves HTTP
    redirects in the urls, filters out non-firmware path types, then
    returns one Request per URL in ``files_urls_field``.
    """
    # check for mandatory fields
    for x in ["vendor", "url"]:
        if x not in item:
            # BUGFIX: the original format string had a single %s but a
            # two-element tuple, so raising it crashed with TypeError
            # instead of dropping the item.
            raise DropItem(
                "Missing required field '%s' for item: %s" % (x, item))
    # resolve dynamic redirects in urls
    for x in ["mib", "sdk", "url"]:
        if x in item:
            split = urlparse.urlsplit(item[x])
            # remove username/password if only one provided
            # BUGFIX: parenthesize — "and" bound tighter than "or" in the
            # original, so a URL with BOTH credentials was also stripped.
            if (split.username or split.password) and not (split.username and split.password):
                item[x] = urlparse.urlunsplit(
                    (split[0], split[1][split[1].find("@") + 1:], split[2], split[3], split[4]))
            if split.scheme == "http":
                item[x] = urllib.urlopen(item[x]).geturl()
    # check for filtered url types in path
    url = urlparse.urlparse(item["url"])
    if any(url.path.endswith(x) for x in [".pdf", ".php", ".txt", ".doc", ".rtf", ".docx", ".htm", ".html", ".md5", ".sha1", ".torrent"]):
        raise DropItem("Filtered path extension: %s" % url.path)
    elif any(x in url.path for x in ["driver", "utility", "install", "wizard", "gpl", "login"]):
        raise DropItem("Filtered path type: %s" % url.path)
    # generate list of url's to download
    item[self.files_urls_field] = [item[x]
                                   for x in ["mib", "url"] if x in item]
    # pass vendor so we can generate the correct file path and name
    return [Request(x, meta={"ftp_user": "anonymous", "ftp_password": "chrome@example.com", "vendor": item["vendor"]}) for x in item[self.files_urls_field]]
示例18
def _drop_item(self, item, errors):
    """
    Record the drop in the validation stats, then raise DropItem.

    Override this to report more detail about the item being dropped, or
    to drop only when specific validation errors are detected.
    """
    self.stats.add_dropped_item()
    raise DropItem("Validation failed!")
示例19
def process_item(self, item, spider):
    """Deduplicate business/review/user items by their type-specific id."""
    if isinstance(item, (BusinessItem, ReviewItem, UserItem)):
        what = item_type(item)
        value = item['{0}_id'.format(what)]
        seen = self.seen_items[what]
        if value in seen:
            raise DropItem('Duplicate {0} found: {1}'.format(what, item))
        seen.add(value)
    return item
# Shamelessly copied from http://stackoverflow.com/q/12230332
示例20
def process_item(self, item, spider):
    """Drop items whose pid was already seen; remember new pids."""
    pid = item['pid']
    if pid in self.ids_seen:
        raise DropItem("Duplicate item found: %s" % item)
    self.ids_seen.add(pid)
    return item
示例21
def process_item(self, item, spider):
    """Drop incomplete items and GitHub items marked as no-reprint."""
    required = (item['title'] and item['content'] and item['url']
                and item['name'] and item['req_url'])
    if not required:
        raise DropItem(f"Data not valid`{item['title']}")
    if 'github' in item['url'] and ',禁止转载' in item['content']:
        # BUGFIX: the original *returned* the DropItem instance instead of
        # raising it, so the item was never actually dropped.
        raise DropItem(f"Data not allowed`{item['title']}")
    return item
示例22
def process_item(self, item, spider):
    """Drop items whose description contains any forbidden word.

    BUGFIX: the original ``return item`` sat in the loop's else branch, so
    the function returned after testing only the FIRST filter word; every
    word is now checked before the item passes.  Python 2's ``unicode``
    was replaced with ``str`` to match the file's Python 3 usage
    (f-strings appear elsewhere in this file).
    """
    description = str(item['description']).lower()
    for word in self.words_to_filter:
        if word in description:
            raise DropItem("Contains forbidden word: %s" % word)
    return item
示例23
def item_completed(self, results, item, info):
    """Attach the stored paths of successful downloads; drop imageless items."""
    paths = []
    for ok, download in results:
        if ok:
            paths.append(download['path'])
    if not paths:
        raise DropItem("Item contains no images")
    item['image_paths'] = paths
    return item
示例24
def process_item(self, item, spider):
    """Drop WeChat/Weibo fetch results already stored in MySQL; pass new ones."""
    session = db_session_mysql()
    try:
        if isinstance(item, FetchResultItem):
            if spider.name == 'weixin':
                # Title (WeChat items can only be deduplicated by title,
                # because their URLs carry expiring signatures).
                article_id_count = session.query(FetchResult) \
                    .filter(FetchResult.platform_id == WEIXIN,
                            FetchResult.article_id == get_finger(item['article_title'])) \
                    .count()
                if article_id_count:
                    raise DropItem(
                        '%s Has been duplication of article_title: %s' % (spider.name, item['article_title']))
            if spider.name == 'weibo':
                # Detail URL (Weibo items can be deduplicated by URL directly).
                article_url_count = session.query(FetchResult) \
                    .filter(FetchResult.platform_id == WEIBO,
                            FetchResult.article_id == get_finger(item['article_url'])) \
                    .count()
                if article_url_count:
                    raise DropItem(
                        '%s Has been duplication of article_url: %s' % (spider.name, item['article_url']))
        return item
    except Exception as e:
        raise e
    finally:
        # Always release the MySQL session, even when the item is dropped.
        session.close()
示例25
def process_item(self, item, spider):
    """Validate that every field has a value, then persist to MongoDB.

    BUGFIX: iterating an item yields its *keys*; the original tested
    ``if not data`` on each key, so empty field values were never
    detected. The check now inspects each field's value.
    """
    for field in item:
        if not item[field]:
            raise DropItem("丢失%s期开奖数据" % item['title'])
    self.collection.insert(dict(item))
    return item
示例26
def _get_stats_for_docs(self, docs, valid, item_cls=TestItem):
    """Run each doc through a fresh validation pipeline and return its stats.

    When ``valid`` is False, every doc is expected to raise DropItem.
    """
    stats = DummyStatsCollector()
    pipeline = JsonSchemaValidatePipeline(stats)
    for doc in docs:
        candidate = item_cls(**doc)
        if not valid:
            self.assertRaises(DropItem, pipeline.process_item, candidate, None)
        else:
            pipeline.process_item(candidate, None)
    return stats
示例27
def process_item(self, item, spider):
    """Save the item's image to disk; stop the spider once the cap is hit."""
    if item is None:
        raise DropItem('Item is null')
    target_dir = self.make_dir()
    file_name = item['image_name'] + '-' + item['image_id'] + '-by@' + item['author'] + '.jpg'
    target_path = os.path.join(target_dir, file_name)
    self.download_image(item['image_src'], target_path)
    self.image_max_counter += 1
    # Stop crawling once the configured maximum number of images is reached.
    if self.image_max_counter >= self.MAXIMUM_IMAGE_NUMBER:
        raise CloseSpider('Current downloaded image already equal maximum number')
    return item
示例28
def download_image(self, src, dest):
    """Fetch ``src`` over HTTP and write the response bytes to ``dest``."""
    thread_name = threading.current_thread().name
    print('[Thread %s] preparing download image.....' % thread_name)
    response = requests.get(src, timeout=2)
    # Anything other than 200 aborts this item.
    if response.status_code != 200:
        raise DropItem('[Thread %s] request image src failed status code = %s'
                       % (thread_name, response.status_code))
    with open(dest, 'wb') as f:
        f.write(response.content)
    print('[DOWNLOAD FINISHED] from %s to %s ' % (src, dest))
示例29
def process_item(self, item, spider):
    """Validate field values, insert into MongoDB, and record the object id.

    BUGFIX: iterating an item yields its *keys*; the original tested
    ``if not data`` on each key, so empty field values were never caught.
    Each field's value is now checked instead.
    """
    for field in item:
        if not item[field]:
            raise DropItem("Missing {0}!".format(field))
    object_id = self.news_collection.insert(dict(item))
    spider.object_id = str(object_id)
    logger.info("Question added to MongoDB database!")
    return item
示例30
def process_item(self, item, spider):
    """Drop quotes already stored in the database; pass new quotes through.

    BUGFIX: both ``session.close()`` calls in the original were
    unreachable — one placed after ``raise DropItem`` and one after
    ``return item`` — so sessions were never closed. A try/finally now
    guarantees the session is released on every path.
    """
    session = self.Session()
    try:
        exist_quote = session.query(Quote).filter_by(quote_content=item["quote_content"]).first()
        if exist_quote is not None:  # the current quote already exists
            raise DropItem("Duplicate item found: %s" % item["quote_content"])
        return item
    finally:
        session.close()