Python source code examples: scrapy.exceptions.NotConfigured()
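Scrapy components (extensions, middlewares, pipelines, download handlers) raise scrapy.exceptions.NotConfigured from their constructor or from_crawler() to tell the framework to skip them: the middleware manager catches the exception, logs the component as disabled, and keeps loading the rest (see Example 2). The examples below are collected from Scrapy itself and from third-party packages. As a minimal sketch of the idiom (MyExtension and MYEXT_ENABLED are placeholder names, not taken from the examples below):

from scrapy.exceptions import NotConfigured

class MyExtension:
    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        # Raising NotConfigured makes Scrapy disable this component
        # instead of treating the failure as an error.
        if not crawler.settings.getbool('MYEXT_ENABLED'):
            raise NotConfigured('MYEXT_ENABLED is off')
        return cls(crawler.stats)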
Example 1
def parse(self, response):
    if not hasattr(self, 'parse_node'):
        raise NotConfigured('You must define parse_node method in order to scrape this XML feed')
    response = self.adapt_response(response)
    if self.iterator == 'iternodes':
        nodes = self._iternodes(response)
    elif self.iterator == 'xml':
        selector = Selector(response, type='xml')
        self._register_namespaces(selector)
        nodes = selector.xpath('//%s' % self.itertag)
    elif self.iterator == 'html':
        selector = Selector(response, type='html')
        self._register_namespaces(selector)
        nodes = selector.xpath('//%s' % self.itertag)
    else:
        raise NotSupported('Unsupported node iterator')
    return self.parse_nodes(response, nodes)
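This is Scrapy's XMLFeedSpider.parse(): a missing parse_node() callback is reported up front as NotConfigured instead of failing later on the first node.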
Example 2
@classmethod
def from_settings(cls, settings, crawler=None):
    mwlist = cls._get_mwlist_from_settings(settings)
    middlewares = []
    enabled = []
    for clspath in mwlist:
        try:
            mwcls = load_object(clspath)
            mw = create_instance(mwcls, settings, crawler)
            middlewares.append(mw)
            enabled.append(clspath)
        except NotConfigured as e:
            # A component raising NotConfigured is skipped, not fatal.
            if e.args:
                clsname = clspath.split('.')[-1]
                logger.warning("Disabled %(clsname)s: %(eargs)s",
                               {'clsname': clsname, 'eargs': e.args[0]},
                               extra={'crawler': crawler})
    logger.info("Enabled %(componentname)ss:\n%(enabledlist)s",
                {'componentname': cls.component_name,
                 'enabledlist': pprint.pformat(enabled)},
                extra={'crawler': crawler})
    return cls(*middlewares)
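Scrapy's MiddlewareManager.from_settings() is the other half of the contract: it catches NotConfigured from each component's constructor, logs the component as disabled (with the exception message, if any), and continues with the remaining components.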
Example 3
def __init__(self, crawler):
    if not crawler.settings.getbool('TELNETCONSOLE_ENABLED'):
        raise NotConfigured
    if not TWISTED_CONCH_AVAILABLE:
        raise NotConfigured(
            'TELNETCONSOLE_ENABLED setting is True but required twisted '
            'modules failed to import:\n' + _TWISTED_CONCH_TRACEBACK)
    self.crawler = crawler
    self.noisy = False
    self.portrange = [int(x) for x in crawler.settings.getlist('TELNETCONSOLE_PORT')]
    self.host = crawler.settings['TELNETCONSOLE_HOST']
    self.username = crawler.settings['TELNETCONSOLE_USERNAME']
    self.password = crawler.settings['TELNETCONSOLE_PASSWORD']
    if not self.password:
        self.password = binascii.hexlify(os.urandom(8)).decode('utf8')
        logger.info('Telnet Password: %s', self.password)
    self.crawler.signals.connect(self.start_listening, signals.engine_started)
    self.crawler.signals.connect(self.stop_listening, signals.engine_stopped)
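Scrapy's telnet console extension disables itself when TELNETCONSOLE_ENABLED is false, and raises with an explanatory message when the required twisted.conch modules fail to import.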
Example 4
def __init__(self, settings):
    self.settings = settings
    self.urifmt = settings['FEED_URI']
    if not self.urifmt:
        raise NotConfigured
    self.format = settings['FEED_FORMAT'].lower()
    self.export_encoding = settings['FEED_EXPORT_ENCODING']
    self.storages = self._load_components('FEED_STORAGES')
    self.exporters = self._load_components('FEED_EXPORTERS')
    if not self._storage_supported(self.urifmt):
        raise NotConfigured
    if not self._exporter_supported(self.format):
        raise NotConfigured
    self.store_empty = settings.getbool('FEED_STORE_EMPTY')
    self._exporting = False
    self.export_fields = settings.getlist('FEED_EXPORT_FIELDS') or None
    self.indent = None
    if settings.get('FEED_EXPORT_INDENT') is not None:
        self.indent = settings.getint('FEED_EXPORT_INDENT')
    uripar = settings['FEED_URI_PARAMS']
    self._uripar = load_object(uripar) if uripar else lambda x, y: None
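Scrapy's (pre-FEEDS) FeedExporter raises bare NotConfigured when FEED_URI is unset or when the configured storage scheme or export format is unsupported.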
Example 5
def _load_handler(self, scheme, skip_lazy=False):
    path = self._schemes[scheme]
    try:
        dhcls = load_object(path)
        if skip_lazy and getattr(dhcls, 'lazy', True):
            return None
        dh = dhcls(self._crawler.settings)
    except NotConfigured as ex:
        self._notconfigured[scheme] = str(ex)
        return None
    except Exception as ex:
        logger.error('Loading "%(clspath)s" for scheme "%(scheme)s"',
                     {"clspath": path, "scheme": scheme},
                     exc_info=True, extra={'crawler': self._crawler})
        self._notconfigured[scheme] = str(ex)
        return None
    else:
        self._handlers[scheme] = dh
        return dh
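Scrapy's download-handler registry records the NotConfigured reason per URI scheme rather than logging it as an error, since a handler that cannot configure itself (for example, s3:// without botocore) is an expected case.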
Example 6
@classmethod
def from_crawler(cls, crawler):
    s = crawler.settings
    proxy_path = s.get('ROTATING_PROXY_LIST_PATH', None)
    if proxy_path is not None:
        with codecs.open(proxy_path, 'r', encoding='utf8') as f:
            proxy_list = [line.strip() for line in f if line.strip()]
    else:
        proxy_list = s.getlist('ROTATING_PROXY_LIST')
    if not proxy_list:
        raise NotConfigured()
    mw = cls(
        proxy_list=proxy_list,
        logstats_interval=s.getfloat('ROTATING_PROXY_LOGSTATS_INTERVAL', 30),
        stop_if_no_proxies=s.getbool('ROTATING_PROXY_CLOSE_SPIDER', False),
        max_proxies_to_try=s.getint('ROTATING_PROXY_PAGE_RETRY_TIMES', 5),
        backoff_base=s.getfloat('ROTATING_PROXY_BACKOFF_BASE', 300),
        backoff_cap=s.getfloat('ROTATING_PROXY_BACKOFF_CAP', 3600),
        crawler=crawler,
    )
    crawler.signals.connect(mw.engine_started,
                            signal=signals.engine_started)
    crawler.signals.connect(mw.engine_stopped,
                            signal=signals.engine_stopped)
    return mw
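From the rotating-proxies package: the middleware disables itself when neither ROTATING_PROXY_LIST_PATH nor ROTATING_PROXY_LIST yields any proxies.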
Example 7
@classmethod
def _load_jsonschema_validator(cls, schema):
    if isinstance(schema, six.string_types):
        schema = get_schema_from(schema)
    if not isinstance(schema, dict):
        raise NotConfigured(
            "Invalid schema, jsonschemas must be defined as:\n"
            "- a python dict.\n"
            "- an object path to a python dict.\n"
            "- an object path to a JSON string.\n"
            "- a path to a JSON file."
        )
    return JSONSchemaValidator(schema)
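A Spidermon helper (see Example 19): an unusable SPIDERMON_VALIDATION_SCHEMAS entry is reported as NotConfigured with a message listing the accepted forms.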
Example 8
@classmethod
def _load_schematics_validator(cls, model_path):
    model_class = load_object(model_path)
    if not issubclass(model_class, Model):
        raise NotConfigured(
            "Invalid model, models must subclass schematics.models.Model"
        )
    return SchematicsValidator(model_class)
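The companion Spidermon helper for SPIDERMON_VALIDATION_MODELS entries, which must subclass schematics.models.Model.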
Example 9
def parse(self, response):
    if not hasattr(self, 'parse_row'):
        raise NotConfigured('You must define parse_row method in order to scrape this CSV feed')
    response = self.adapt_response(response)
    return self.parse_rows(response)
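Scrapy's CSVFeedSpider counterpart to Example 1, requiring a parse_row() callback.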
Example 10
def skip_if_no_boto():
    try:
        is_botocore()
    except NotConfigured as e:
        raise SkipTest(e)
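A helper from Scrapy's test suite: is_botocore() raises NotConfigured when neither botocore nor boto is importable, which is translated here into a skipped test.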
Example 11
def project_data_dir(project='default'):
    """Return the current project data dir, creating it if it doesn't exist"""
    if not inside_project():
        raise NotConfigured("Not inside a project")
    cfg = get_config()
    if cfg.has_option(DATADIR_CFG_SECTION, project):
        d = cfg.get(DATADIR_CFG_SECTION, project)
    else:
        scrapy_cfg = closest_scrapy_cfg()
        if not scrapy_cfg:
            raise NotConfigured("Unable to find scrapy.cfg file to infer project data dir")
        d = abspath(join(dirname(scrapy_cfg), '.scrapy'))
    if not exists(d):
        os.makedirs(d)
    return d
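scrapy.utils.project.project_data_dir() reuses NotConfigured for environment problems: running outside a project, or failing to locate scrapy.cfg.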
Example 12
def _load_components(self, setting_prefix):
    conf = without_none_values(self.settings.getwithbase(setting_prefix))
    d = {}
    for k, v in conf.items():
        try:
            d[k] = load_object(v)
        except NotConfigured:
            pass
    return d
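FeedExporter's component loader silently drops any storage or exporter class that raises NotConfigured when loaded.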
Example 13
def _storage_supported(self, uri):
    scheme = urlparse(uri).scheme
    if scheme in self.storages:
        try:
            self._get_storage(uri)
            return True
        except NotConfigured as e:
            logger.error("Disabled feed storage scheme: %(scheme)s. "
                         "Reason: %(reason)s",
                         {'scheme': scheme, 'reason': str(e)})
    else:
        logger.error("Unknown feed storage scheme: %(scheme)s",
                     {'scheme': scheme})
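FeedExporter._storage_supported() turns a storage backend's NotConfigured into an error log; both failure branches fall through and implicitly return None (falsy).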
Example 14
def __init__(self, crawler):
    settings = crawler.settings
    spider = crawler.spider
    if not any(
        self.__class__.__name__ in s
        for s in settings.getwithbase('SPIDER_MIDDLEWARES').keys()
    ):
        raise ValueError(
            '%s must be in SPIDER_MIDDLEWARES' % (
                self.__class__.__name__,))
    if not settings.getbool('AUTOUNIT_ENABLED'):
        raise NotConfigured('scrapy-autounit is not enabled')
    if settings.getint('CONCURRENT_REQUESTS') > 1:
        logger.warning(
            'Recording with concurrency > 1! '
            'Data races in shared object modification may create broken '
            'tests.'
        )
    self.max_fixtures = settings.getint(
        'AUTOUNIT_MAX_FIXTURES_PER_CALLBACK',
        default=10
    )
    self.max_fixtures = \
        self.max_fixtures if self.max_fixtures >= 10 else 10
    self.base_path = settings.get(
        'AUTOUNIT_BASE_PATH',
        default=os.path.join(get_project_dir(), 'autounit')
    )
    create_dir(self.base_path, exist_ok=True)
    clear_fixtures(self.base_path, sanitize_module_name(spider.name))
    self.fixture_counters = {}
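Apparently from scrapy-autounit: note the contrast between ValueError for a hard misconfiguration (the middleware missing from SPIDER_MIDDLEWARES) and NotConfigured for the extension simply being switched off.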
Example 15
@classmethod
def from_crawler(cls, crawler):
    s = crawler.settings
    if not s.getbool('CRAWL_ONCE_ENABLED', True):
        raise NotConfigured()
    path = data_path(s.get('CRAWL_ONCE_PATH', 'crawl_once'),
                     createdir=True)
    default = s.getbool('CRAWL_ONCE_DEFAULT', default=False)
    o = cls(path, crawler.stats, default)
    crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
    crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
    return o
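From scrapy-crawl-once. CRAWL_ONCE_ENABLED defaults to True here, so NotConfigured is raised only when the user explicitly disables the middleware.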
Example 16
def test_not_configured():
    crawler = get_crawler(settings_dict={'CRAWL_ONCE_ENABLED': False})
    with pytest.raises(NotConfigured):
        CrawlOnceMiddleware.from_crawler(crawler)
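The matching scrapy-crawl-once test: with CRAWL_ONCE_ENABLED set to False, from_crawler() must raise NotConfigured.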
Example 17
@classmethod
def from_crawler(cls, crawler):
    dsn = crawler.settings.get("SENTRY_DSN", None)
    if dsn is None:
        raise NotConfigured('No SENTRY_DSN configured')
    return cls(dsn=dsn)
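A Sentry error-reporting middleware that disables itself when SENTRY_DSN is unset; the message ends up in the middleware manager's "Disabled ..." log line (Example 2).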
Example 18
def test_from_crawler(self, mocked_hsref):
    crawler_mock = mock.Mock()
    crawler_mock.settings = Settings()
    self.assertRaises(NotConfigured,
                      PageStorageMiddleware.from_crawler,
                      crawler_mock)
    # test creating an instance for all other cases
    crawler_mock.settings = mock.Mock()
    mocked_values = [(True, False), (False, True), (True, True)]
    crawler_mock.settings.side_effect = mocked_values
    for _ in range(len(mocked_values)):
        assert isinstance(PageStorageMiddleware.from_crawler(crawler_mock),
                          PageStorageMiddleware)
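Apparently a scrapy-pagestorage test: an empty Settings() makes from_crawler() raise NotConfigured, while mocked setting values allow construction.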
Example 19
@classmethod
def from_crawler(cls, crawler):
    spidermon_enabled = crawler.settings.getbool("SPIDERMON_ENABLED")
    if not spidermon_enabled:
        return PassThroughPipeline()

    validators = defaultdict(list)
    allowed_types = (list, tuple, dict)

    def set_validators(loader, schema):
        if type(schema) in (list, tuple):
            schema = {Item: schema}
        for obj, paths in schema.items():
            key = obj.__name__
            paths = paths if type(paths) in (list, tuple) else [paths]
            objects = [loader(v) for v in paths]
            validators[key].extend(objects)

    for loader, name in [
        (cls._load_jsonschema_validator, "SPIDERMON_VALIDATION_SCHEMAS"),
        (cls._load_schematics_validator, "SPIDERMON_VALIDATION_MODELS"),
    ]:
        res = crawler.settings.get(name)
        if not res:
            continue
        if type(res) not in allowed_types:
            raise NotConfigured(
                "Invalid <{}> type for <{}> settings, dict or list/tuple "
                "is required".format(type(res), name)
            )
        set_validators(loader, res)

    if not validators:
        raise NotConfigured("No validators were found")

    return cls(
        validators=validators,
        stats=crawler.stats,
        drop_items_with_errors=crawler.settings.getbool(
            "SPIDERMON_VALIDATION_DROP_ITEMS_WITH_ERRORS"
        ),
        add_errors_to_items=crawler.settings.getbool(
            "SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS"
        ),
        errors_field=crawler.settings.get("SPIDERMON_VALIDATION_ERRORS_FIELD"),
    )
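Spidermon's ItemValidationPipeline.from_crawler(): NotConfigured covers both malformed validation settings and the no-validators case; when Spidermon itself is disabled, it returns a pass-through pipeline instead of raising.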