Python source code examples: scrapy.settings.Settings()
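Before the examples, here is a minimal sketch (not taken from any of the projects below) of the core Settings API that the snippets rely on; the setting names and values are arbitrary illustrations:

from scrapy.settings import Settings

settings = Settings({'DOWNLOAD_DELAY': 0.5})             # seed values get 'project' priority by default
settings.set('LOG_LEVEL', 'INFO', priority='cmdline')    # explicit priority; later lower-priority writes will not override this
settings.setdict({'RETRY_TIMES': 3}, priority='project')
print(settings.getfloat('DOWNLOAD_DELAY'))   # 0.5
print(settings.getint('RETRY_TIMES'))        # 3
print(settings.getpriority('LOG_LEVEL'))     # 40, the numeric value of 'cmdline'
frozen = settings.copy()
frozen.freeze()                              # further set() calls on the copy now raise TypeError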
Example 1
def get_project_settings():
    if ENVVAR not in os.environ:
        project = os.environ.get('SCRAPY_PROJECT', 'default')
        init_env(project)
    settings = Settings()
    settings_module_path = os.environ.get(ENVVAR)
    if settings_module_path:
        settings.setmodule(settings_module_path, priority='project')
    # XXX: remove this hack
    pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE")
    if pickled_settings:
        settings.setdict(pickle.loads(pickled_settings), priority='project')
    # XXX: deprecate and remove this functionality
    env_overrides = {k[7:]: v for k, v in os.environ.items() if
                     k.startswith('SCRAPY_')}
    if env_overrides:
        settings.setdict(env_overrides, priority='project')
    return settings
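For context, the function above is usually paired with CrawlerProcess; a hedged usage sketch, assuming a regular Scrapy project layout and a hypothetical spider named 'myspider':

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('myspider')   # spider name resolved via the project's spider loader
process.start()             # blocks until the crawl finishes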
Example 2
def main():
    """Main routine for the execution of the Spider"""
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)
    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)
    # set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    # define the spider for the crawler
    crawler.crawl(EuropythonSpyder())
    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by calling the spider defined above
    print("ENGINE STOPPED")
Example 3
def test_get_enabled_status():
    settings = Settings()
    # check for void settings
    assert _get_enabled_status(settings) == (False, False)
    # plugin enabled with settings
    settings.set('PAGE_STORAGE_ENABLED', True)
    assert _get_enabled_status(settings) == (True, False)
    settings.set('PAGE_STORAGE_ENABLED', None)
    # plugin enabled by spider_type
    for spider_type in ['auto', 'portia']:
        os.environ['SHUB_SPIDER_TYPE'] = spider_type
        assert _get_enabled_status(settings) == (True, False)
    os.environ['SHUB_SPIDER_TYPE'] = 'other_spider_type'
    assert _get_enabled_status(settings) == (False, False)
    # plugin enabled on error
    settings.set('PAGE_STORAGE_ON_ERROR_ENABLED', True)
    assert _get_enabled_status(settings) == (False, True)
Example 4
def fetch_url(cls, session, msites, platform_id, purpose):
    """Actual method to do fetch url action.

    Parameters
    ----------
    msites : list
        a list of Site model class, contains info to build spiders.
    platform_id : int
        id of platform, bind fetched url with this id.
    purpose : {'update', 'archive'}
        indicate which url to fetch.
    """
    settings = Settings(cls.conf['crawl']['scrapy'])
    settings.set('ITEM_PIPELINES',
                 {'hoaxy.crawl.pipelines.UrlPipeline': 300})
    process = CrawlerProcess(settings)
    sll = cls.conf['logging']['loggers']['scrapy']['level']
    logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
    for ms in msites:
        for sm in build_spiders_iter(ms, purpose):
            sm['kwargs']['session'] = session
            sm['kwargs']['platform_id'] = platform_id
            process.crawl(sm['cls'], *sm['args'], **sm['kwargs'])
    process.start()
Example 5
def fetch_html(cls, session, url_tuples):
    """Actual method to do fetch html action.

    Parameters
    ----------
    session : object
        a SQLAlchemy session object.
    url_tuples : list
        a list of url tuple (id, raw, status_code).
    """
    settings = Settings(cls.conf['crawl']['scrapy'])
    settings.set('ITEM_PIPELINES',
                 {'hoaxy.crawl.pipelines.HtmlPipeline': 300})
    process = CrawlerProcess(settings)
    sll = cls.conf['logging']['loggers']['scrapy']['level']
    logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
    logger.warning('Number of url to fetch html is: %s', len(url_tuples))
    process.crawl(
        HtmlSpider,
        session=session,
        url_tuples=url_tuples,
        excluded_domains=cls.conf['crawl']['excluded_domains'])
    process.start()
Example 6
def start_job(self, job=None, callback_fn=None):
    print(job)
    spider_job = job['spider_job']
    runner = job['runner']
    spider_cls = spider_job['spider_cls']
    spider_settings = spider_job['spider_settings']
    spider_kwargs = spider_job['spider_kwargs']

    def engine_stopped_callback():
        runner.transform_and_index(callback_fn=callback_fn)

    if callback_fn:
        print("""
        ==========================================================
        WARNING: callback_fn is {}
        ==========================================================
        Since start_job is called with callback_fn, make sure you end the reactor if you want the spider process to
        stop after the callback function is executed. By default callback_fn=None will close the reactor.
        To write a custom callback_fn
        def callback_fn():
            print("Write your own callback logic")
            from twisted.internet import reactor
            reactor.stop()
        ==========================================================
        """.format(callback_fn))

    spider = Crawler(spider_cls, Settings(spider_settings))
    spider.signals.connect(engine_stopped_callback, signals.engine_stopped)
    self.runner.crawl(spider, **spider_kwargs)
    """
    d = runner.crawl(spider, **spider_kwargs)
    # d.addBoth(engine_stopped_callback)
    """
    reactor.run()
Example 7
def test_spidermon_aws_credentials_not_set():
    settings = Settings()
    (aws_access_key_id, aws_secret_access_key) = get_aws_credentials(settings)
    assert aws_access_key_id is None
    assert aws_secret_access_key is None
Example 8
def test_spidermon_aws_credentials(mocker):
    warn_mock = mocker.patch("spidermon.utils.settings.warnings.warn")
    settings = Settings(
        {
            "SPIDERMON_AWS_ACCESS_KEY": "aws_access_key",
            "SPIDERMON_AWS_SECRET_KEY": "aws_secret_key",
        }
    )
    (aws_access_key_id, aws_secret_access_key) = get_aws_credentials(settings)
    assert aws_access_key_id == "aws_access_key"
    assert aws_secret_access_key == "aws_secret_key"
    warn_mock.assert_called_with(mocker.ANY, DeprecationWarning)
Example 9
def test_spidermon_aws_credentials_scrapy_like():
    settings = Settings(
        {
            "SPIDERMON_AWS_ACCESS_KEY_ID": "aws_access_key_id",
            "SPIDERMON_AWS_SECRET_ACCESS_KEY": "aws_secret_access_key",
        }
    )
    (aws_access_key_id, aws_secret_access_key) = get_aws_credentials(settings)
    assert aws_access_key_id == "aws_access_key_id"
    assert aws_secret_access_key == "aws_secret_access_key"
Example 10
def test_spidermon_aws_credentials_fall_back_to_scrapy():
    settings = Settings(
        {
            "AWS_ACCESS_KEY_ID": "scrapy_aws_access_key_id",
            "AWS_SECRET_ACCESS_KEY": "scrapy_aws_secret_access_key",
        }
    )
    (aws_access_key_id, aws_secret_access_key) = get_aws_credentials(settings)
    assert aws_access_key_id == "scrapy_aws_access_key_id"
    assert aws_secret_access_key == "scrapy_aws_secret_access_key"
Example 11
def test_spidermon_aws_credentials_are_preferred_over_scrapy_ones():
    settings = Settings(
        {
            "AWS_ACCESS_KEY_ID": "scrapy_aws_access_key_id",
            "AWS_SECRET_ACCESS_KEY": "scrapy_aws_secret_access_key",
            "SPIDERMON_AWS_ACCESS_KEY_ID": "spidermon_aws_access_key_id",
            "SPIDERMON_AWS_SECRET_ACCESS_KEY": "spidermon_aws_secret_access_key",
        }
    )
    (aws_access_key_id, aws_secret_access_key) = get_aws_credentials(settings)
    assert aws_access_key_id == "spidermon_aws_access_key_id"
    assert aws_secret_access_key == "spidermon_aws_secret_access_key"
Example 12
def settings(request):
    """Default scrapy-poet settings"""
    s = dict(
        # collect scraped items to .collected_items attribute
        ITEM_PIPELINES={
            'tests.utils.CollectorPipeline': 100,
        },
        DOWNLOADER_MIDDLEWARES={
            'scrapy_poet.InjectionMiddleware': 543,
        },
    )
    return Settings(s)
Example 13
def __init__(self, spidercls, settings=None):
    if isinstance(spidercls, Spider):
        raise ValueError(
            'The spidercls argument must be a class, not an object')
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)
    self.spidercls = spidercls
    self.settings = settings.copy()
    self.spidercls.update_settings(self.settings)
    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)
    handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)
    d = dict(overridden_settings(self.settings))
    logger.info("Overridden settings: %(settings)r", {'settings': d})
    if get_scrapy_root_handler() is not None:
        # scrapy root handler already installed: update it with new settings
        install_scrapy_root_handler(self.settings)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)
    self.settings.freeze()
    self.crawling = False
    self.spider = None
    self.engine = None
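The settings.copy() / update_settings() / freeze() sequence above is what lets a spider's custom_settings override project settings before the Settings object becomes immutable. A small illustrative sketch (the spider class and values are hypothetical):

import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    # merged by Spider.update_settings() at 'spider' priority, which outranks 'project'
    custom_settings = {'DOWNLOAD_DELAY': 2}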
Example 14
def __init__(self, settings=None):
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)
    self.settings = settings
    self.spider_loader = _get_spider_loader(settings)
    self._crawlers = set()
    self._active = set()
    self.bootstrap_failed = False
Example 15
def configure_logging(settings=None, install_root_handler=True):
    """
    Initialize logging defaults for Scrapy.

    :param settings: settings used to create and configure a handler for the
        root logger (default: None).
    :type settings: dict, :class:`~scrapy.settings.Settings` object or ``None``

    :param install_root_handler: whether to install root logging handler
        (default: True)
    :type install_root_handler: bool

    This function does:

    - Route warnings and twisted logging through Python standard logging
    - Assign DEBUG and ERROR level to Scrapy and Twisted loggers respectively
    - Route stdout to log if LOG_STDOUT setting is True

    When ``install_root_handler`` is True (default), this function also
    creates a handler for the root logger according to given settings
    (see :ref:`topics-logging-settings`). You can override default options
    using ``settings`` argument. When ``settings`` is empty or None, defaults
    are used.
    """
    if not sys.warnoptions:
        # Route warnings through python logging
        logging.captureWarnings(True)
    observer = twisted_log.PythonLoggingObserver('twisted')
    observer.start()
    dictConfig(DEFAULT_LOGGING)
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)
    if settings.getbool('LOG_STDOUT'):
        sys.stdout = StreamLogger(logging.getLogger('stdout'))
    if install_root_handler:
        install_scrapy_root_handler(settings)
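A common way to call the function above is to skip the root handler Scrapy would install and configure logging manually (a minimal sketch; the log file path is a placeholder):

import logging
from scrapy.utils.log import configure_logging

configure_logging(install_root_handler=False)
logging.basicConfig(
    filename='log.txt',                    # hypothetical log file path
    format='%(levelname)s: %(message)s',
    level=logging.INFO,
)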
Example 16
def __init__(self, store_uri, download_func=None, settings=None):
    if not store_uri:
        raise NotConfigured
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)
    cls_name = "FilesPipeline"
    self.store = self._get_store(store_uri)
    resolve = functools.partial(self._key_for_pipe,
                                base_class_name=cls_name,
                                settings=settings)
    self.expires = settings.getint(
        resolve('FILES_EXPIRES'), self.EXPIRES
    )
    if not hasattr(self, "FILES_URLS_FIELD"):
        self.FILES_URLS_FIELD = self.DEFAULT_FILES_URLS_FIELD
    if not hasattr(self, "FILES_RESULT_FIELD"):
        self.FILES_RESULT_FIELD = self.DEFAULT_FILES_RESULT_FIELD
    self.files_urls_field = settings.get(
        resolve('FILES_URLS_FIELD'), self.FILES_URLS_FIELD
    )
    self.files_result_field = settings.get(
        resolve('FILES_RESULT_FIELD'), self.FILES_RESULT_FIELD
    )
    super(FilesPipeline, self).__init__(download_func=download_func, settings=settings)
Example 17
def __init__(self, download_func=None, settings=None):
    self.download_func = download_func
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)
    resolve = functools.partial(self._key_for_pipe,
                                base_class_name="MediaPipeline",
                                settings=settings)
    self.allow_redirects = settings.getbool(
        resolve('MEDIA_ALLOW_REDIRECTS'), False
    )
    self._handle_statuses(self.allow_redirects)
Example 18
def __init__(self, store_uri, download_func=None, settings=None):
    super(ImagesPipeline, self).__init__(store_uri, settings=settings,
                                         download_func=download_func)
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)
    resolve = functools.partial(self._key_for_pipe,
                                base_class_name="ImagesPipeline",
                                settings=settings)
    self.expires = settings.getint(
        resolve("IMAGES_EXPIRES"), self.EXPIRES
    )
    if not hasattr(self, "IMAGES_RESULT_FIELD"):
        self.IMAGES_RESULT_FIELD = self.DEFAULT_IMAGES_RESULT_FIELD
    if not hasattr(self, "IMAGES_URLS_FIELD"):
        self.IMAGES_URLS_FIELD = self.DEFAULT_IMAGES_URLS_FIELD
    self.images_urls_field = settings.get(
        resolve('IMAGES_URLS_FIELD'),
        self.IMAGES_URLS_FIELD
    )
    self.images_result_field = settings.get(
        resolve('IMAGES_RESULT_FIELD'),
        self.IMAGES_RESULT_FIELD
    )
    self.min_width = settings.getint(
        resolve('IMAGES_MIN_WIDTH'), self.MIN_WIDTH
    )
    self.min_height = settings.getint(
        resolve('IMAGES_MIN_HEIGHT'), self.MIN_HEIGHT
    )
    self.thumbs = settings.get(
        resolve('IMAGES_THUMBS'), self.THUMBS
    )
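For reference, a hedged sketch of typical settings used to enable and configure the ImagesPipeline whose constructor appears above; the store path and numbers are placeholders:

settings = Settings({
    'ITEM_PIPELINES': {'scrapy.pipelines.images.ImagesPipeline': 1},
    'IMAGES_STORE': '/tmp/images',    # hypothetical storage directory
    'IMAGES_EXPIRES': 30,             # days before re-downloading cached images
    'IMAGES_MIN_WIDTH': 110,          # skip images smaller than this
    'IMAGES_MIN_HEIGHT': 110,
    'IMAGES_THUMBS': {'small': (50, 50), 'big': (270, 270)},
})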
Example 19
def setUp(self):
    self.spider = Spider('default')
    self.mocked_hsref = mock.Mock()
    self.patch = mock.patch('sh_scrapy.hsref.hsref', self.mocked_hsref)
    self.crawler_mock = mock.Mock()
    self.crawler_mock.settings = Settings(
        {'PAGE_STORAGE_ENABLED': True,
         'PAGE_STORAGE_MODE': 'VERSIONED_CACHE',
         'PAGE_STORAGE_LIMIT': 10,
         'PAGE_STORAGE_ON_ERROR_LIMIT': 5})
    self.mocked_hsref.project.collections.url = '/test/url'
    self.patch.start()
    self.instance = PageStorageMiddleware.from_crawler(self.crawler_mock)
Example 20
def test_from_crawler(self, mocked_hsref):
    crawler_mock = mock.Mock()
    crawler_mock.settings = Settings()
    self.assertRaises(NotConfigured,
                      PageStorageMiddleware.from_crawler,
                      crawler_mock)
    # test creating an instance for all other cases
    crawler_mock.settings = mock.Mock()
    mocked_values = [(True, False), (False, True), (True, True)]
    crawler_mock.settings.side_effect = mocked_values
    for _ in range(len(mocked_values)):
        assert isinstance(PageStorageMiddleware.from_crawler(crawler_mock),
                          PageStorageMiddleware)
Example 21
def start_spider(args):
    settings.LOG_LEVEL = args.log_level
    project_settings = Settings()
    project_settings.setmodule(settings)
    process = CrawlerProcess(project_settings)
    process.crawl(ImageSpider, domains=args.domains, start_urls=args.start_urls,
                  jobname=args.jobname, stay_under=args.stay_under,
                  monitor=args.monitor, user_agent=args.user_agent,
                  minsize=args.min_size, no_cache=args.no_cache,
                  images_store=args.images_store, depth_limit=args.depth_limit,
                  url_regex=args.url_regex, no_cdns=args.no_cdns,
                  auto_throttle=args.auto_throttle, log_level=args.log_level)
    process.start()
Example 22
def get_crawler_class(self, crawler):
    """
    Searches through the modules in self.__crawer_module for a crawler with
    the name passed along.
    :param str crawler: Name of the crawler to load
    :rtype: crawler-class
    """
    settings = Settings()
    settings.set('SPIDER_MODULES', [self.__crawer_module])
    spider_loader = SpiderLoader(settings)
    return spider_loader.load(crawler)
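Besides load(), a SpiderLoader built this way also exposes list(); a hedged sketch with a hypothetical spider module and spider name:

from scrapy.settings import Settings
from scrapy.spiderloader import SpiderLoader

settings = Settings({'SPIDER_MODULES': ['myproject.spiders']})
loader = SpiderLoader.from_settings(settings)
print(loader.list())                  # names of every spider found in SPIDER_MODULES
spider_cls = loader.load('myspider')  # raises KeyError if the name is unknown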
Example 23
def download_regions():
    if os.path.exists("data/regions.json"):
        os.remove("data/regions.json")
    settings = Settings()
    os.environ['SCRAPY_SETTINGS_MODULE'] = 'parser.dtpparser.settings'
    settings_module_path = os.environ['SCRAPY_SETTINGS_MODULE']
    settings.setmodule(settings_module_path, priority='project')
    process = CrawlerProcess(settings)
    process.crawl(RegionSpider)
    process.start()
Example 24
def download_dtp():
    if os.path.exists("data/dtp.json"):
        os.remove("data/dtp.json")
    settings = Settings()
    os.environ['SCRAPY_SETTINGS_MODULE'] = 'parser.dtpparser.settings'
    settings_module_path = os.environ['SCRAPY_SETTINGS_MODULE']
    settings.setmodule(settings_module_path, priority='project')
    process = CrawlerProcess(settings)
    process.crawl(DtpSpider)
    process.start()
Example 25
def main():
    # configure the settings for the crawler and spider
    args = parse_args()
    config = {
        'domains': args.domains,
        'directory': args.output,
        'allow': args.allow,
        'deny': args.deny,
        'unix': args.unix,
    }
    settings = Settings({
        'USER_AGENT': (
            'Wayback Machine Scraper/{0} '
            '(+https://github.com/sangaline/scrapy-wayback-machine)'
        ).format(get_distribution('wayback-machine-scraper').version),
        'LOG_LEVEL': 'DEBUG' if args.verbose else 'INFO',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_wayback_machine.WaybackMachineMiddleware': 5,
        },
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_DEBUG': args.verbose,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': args.concurrency,
        'WAYBACK_MACHINE_TIME_RANGE': (getattr(args, 'from'), args.to),
    })
    # start the crawler
    process = CrawlerProcess(settings)
    process.crawl(MirrorSpider, **config)
    process.start()