Python source code examples: scrapy.settings.Settings()
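scrapy.settings.Settings is Scrapy's container for configuration values: it can be initialised from a dict, layers values by priority, and provides typed getters. Before the project-specific examples below, a minimal generic sketch of that API:

from scrapy.settings import Settings

settings = Settings({'DOWNLOAD_DELAY': 2})                  # built-in defaults plus this dict
settings.set('CONCURRENT_REQUESTS', 8, priority='cmdline')  # higher priorities win
print(settings.getint('DOWNLOAD_DELAY'))                    # 2
print(settings.getbool('LOG_ENABLED'))                      # True (Scrapy default)
print(settings.getdict('ITEM_PIPELINES'))                   # {} until pipelines are configured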

Example 1
def get_project_settings():
    if ENVVAR not in os.environ:
        project = os.environ.get('SCRAPY_PROJECT', 'default')
        init_env(project)

    settings = Settings()
    settings_module_path = os.environ.get(ENVVAR)
    if settings_module_path:
        settings.setmodule(settings_module_path, priority='project')

    # XXX: remove this hack
    pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE")
    if pickled_settings:
        settings.setdict(pickle.loads(pickled_settings), priority='project')

    # XXX: deprecate and remove this functionality
    env_overrides = {k[7:]: v for k, v in os.environ.items() if
                     k.startswith('SCRAPY_')}
    if env_overrides:
        settings.setdict(env_overrides, priority='project')

    return settings 
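The helper above is Scrapy's own routine for resolving project settings from scrapy.cfg and the environment. In a standalone script it is typically passed straight to CrawlerProcess; a short sketch (the spider class is a placeholder):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
# process.crawl(SomeSpider)   # SomeSpider is a placeholder spider class
# process.start()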
Example 2
def main():
    """Main routine for the execution of the spider"""
    # set up a signal handler to catch scraped items
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    # item_passed is a legacy alias of the item_scraped signal
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # set up the crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)

    # register the spider with the crawler (pass the class, not an instance)
    crawler.crawl(EuropythonSpyder)

    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by running the spider defined above
    print("ENGINE STOPPED")
Example 3
def test_get_enabled_status():
    settings = Settings()
    # check for void settings
    assert _get_enabled_status(settings) == (False, False)
    # plugin enabled with settings
    settings.set('PAGE_STORAGE_ENABLED', True)
    assert _get_enabled_status(settings) == (True, False)
    settings.set('PAGE_STORAGE_ENABLED', None)
    # plugin enabled by spider_type
    for spider_type in ['auto', 'portia']:
        os.environ['SHUB_SPIDER_TYPE'] = spider_type
        assert _get_enabled_status(settings) == (True, False)
    os.environ['SHUB_SPIDER_TYPE'] = 'other_spider_type'
    assert _get_enabled_status(settings) == (False, False)
    # plugin enabled on error
    settings.set('PAGE_STORAGE_ON_ERROR_ENABLED', True)
    assert _get_enabled_status(settings) == (False, True) 
Example 4
def fetch_url(cls, session, msites, platform_id, purpose):
        """Actual method to do fetch url action.

        Parameters
        ----------
            session : object
                a SQLAlchemy session object.
            msites : list
                a list of Site model class, contains info to build spiders.
            platform_id : int
                id of platform, bind fetched url with this id.
            purpose : {'update', 'archive'}
                indicate which url to fetch.
        """
        settings = Settings(cls.conf['crawl']['scrapy'])
        settings.set('ITEM_PIPELINES',
                     {'hoaxy.crawl.pipelines.UrlPipeline': 300})
        process = CrawlerProcess(settings)
        sll = cls.conf['logging']['loggers']['scrapy']['level']
        logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
        for ms in msites:
            for sm in build_spiders_iter(ms, purpose):
                sm['kwargs']['session'] = session
                sm['kwargs']['platform_id'] = platform_id
                process.crawl(sm['cls'], *sm['args'], **sm['kwargs'])
        process.start() 
Example 5
def fetch_html(cls, session, url_tuples):
        """Actual method to do fetch html action.

        Parameters
        ----------
            session : object
                a SQLAlchemy session object.
            url_tuples : list
                a list of url tuple (id, raw, status_code).
        """
        settings = Settings(cls.conf['crawl']['scrapy'])
        settings.set('ITEM_PIPELINES',
                     {'hoaxy.crawl.pipelines.HtmlPipeline': 300})
        process = CrawlerProcess(settings)
        sll = cls.conf['logging']['loggers']['scrapy']['level']
        logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
        logger.warning('Number of url to fetch html is: %s', len(url_tuples))
        process.crawl(
            HtmlSpider,
            session=session,
            url_tuples=url_tuples,
            excluded_domains=cls.conf['crawl']['excluded_domains'])
        process.start() 
Example 6
def start_job(self, job=None, callback_fn=None):
        print(job)
        spider_job = job['spider_job']
        runner = job['runner']
        spider_cls = spider_job['spider_cls']
        spider_settings = spider_job['spider_settings']
        spider_kwargs = spider_job['spider_kwargs']

        def engine_stopped_callback():
            runner.transform_and_index(callback_fn=callback_fn)

        if callback_fn:
            print("""
==========================================================
WARNING: callback_fn is {}
==========================================================
Since start_job is called with callback_fn, make sure you end the reactor if you want the spider process to
stop after the callback function is executed. By default callback_fn=None will close the reactor.

To write a custom callback_fn

def callback_fn():
    print ("Write your own callback logic")
    from twisted.internet import reactor
    reactor.stop()
==========================================================
        """.format(callback_fn))

        spider = Crawler(spider_cls, Settings(spider_settings))
        spider.signals.connect(engine_stopped_callback, signals.engine_stopped)
        self.runner.crawl(spider, **spider_kwargs)
        """
        d = runner.crawl(spider, **spider_kwargs)
        # d.addBoth(engine_stopped_callback)
        """
        reactor.run() 
Example 7
def test_spidermon_aws_credentials_not_set():
    settings = Settings()

    (aws_access_key_id, aws_secret_access_key) = get_aws_credentials(settings)

    assert aws_access_key_id is None
    assert aws_secret_access_key is None 
Example 8
def test_spidermon_aws_credentials(mocker):
    warn_mock = mocker.patch("spidermon.utils.settings.warnings.warn")
    settings = Settings(
        {
            "SPIDERMON_AWS_ACCESS_KEY": "aws_access_key",
            "SPIDERMON_AWS_SECRET_KEY": "aws_secret_key",
        }
    )

    (aws_access_key_id, aws_secret_access_key) = get_aws_credentials(settings)

    assert aws_access_key_id == "aws_access_key"
    assert aws_secret_access_key == "aws_secret_key"
    warn_mock.assert_called_with(mocker.ANY, DeprecationWarning) 
Example 9
def test_spidermon_aws_credentials_scrapy_like():
    settings = Settings(
        {
            "SPIDERMON_AWS_ACCESS_KEY_ID": "aws_access_key_id",
            "SPIDERMON_AWS_SECRET_ACCESS_KEY": "aws_secret_access_key",
        }
    )

    (aws_access_key_id, aws_secret_access_key) = get_aws_credentials(settings)

    assert aws_access_key_id == "aws_access_key_id"
    assert aws_secret_access_key == "aws_secret_access_key" 
Example 10
def test_spidermon_aws_credentials_fall_back_to_scrapy():
    settings = Settings(
        {
            "AWS_ACCESS_KEY_ID": "scrapy_aws_access_key_id",
            "AWS_SECRET_ACCESS_KEY": "scrapy_aws_secret_access_key",
        }
    )

    (aws_access_key_id, aws_secret_access_key) = get_aws_credentials(settings)

    assert aws_access_key_id == "scrapy_aws_access_key_id"
    assert aws_secret_access_key == "scrapy_aws_secret_access_key" 
Example 11
def test_spidermon_aws_credentials_are_preferred_over_scrapy_ones():
    settings = Settings(
        {
            "AWS_ACCESS_KEY_ID": "scrapy_aws_access_key_id",
            "AWS_SECRET_ACCESS_KEY": "scrapy_aws_secret_access_key",
            "SPIDERMON_AWS_ACCESS_KEY_ID": "spidermon_aws_access_key_id",
            "SPIDERMON_AWS_SECRET_ACCESS_KEY": "spidermon_aws_secret_access_key",
        }
    )

    (aws_access_key_id, aws_secret_access_key) = get_aws_credentials(settings)

    assert aws_access_key_id == "spidermon_aws_access_key_id"
    assert aws_secret_access_key == "spidermon_aws_secret_access_key" 
Example 12
def settings(request):
    """ Default scrapy-poet settings """
    s = dict(
        # collect scraped items to .collected_items attribute
        ITEM_PIPELINES={
            'tests.utils.CollectorPipeline': 100,
        },
        DOWNLOADER_MIDDLEWARES={
            'scrapy_poet.InjectionMiddleware': 543,
        },
    )
    return Settings(s) 
Example 13
def __init__(self, spidercls, settings=None):
        if isinstance(spidercls, Spider):
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings: %(settings)r", {'settings': d})

        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None 
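This constructor copies the incoming settings, lets the spider class update them, and finally freezes them; once frozen, any further set() call raises. A small illustration outside of any crawler:

from scrapy.settings import Settings

settings = Settings({'LOG_LEVEL': 'INFO'})
per_crawler = settings.copy()              # the Crawler works on a copy
per_crawler.set('LOG_LEVEL', 'DEBUG')      # still mutable at this point
per_crawler.freeze()

try:
    per_crawler.set('LOG_LEVEL', 'WARNING')
except TypeError as exc:                   # frozen Settings reject modification
    print(exc)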
Example 14
def __init__(self, settings=None):
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)
        self.settings = settings
        self.spider_loader = _get_spider_loader(settings)
        self._crawlers = set()
        self._active = set()
        self.bootstrap_failed = False 
Example 15
def configure_logging(settings=None, install_root_handler=True):
    """
    Initialize logging defaults for Scrapy.

    :param settings: settings used to create and configure a handler for the
        root logger (default: None).
    :type settings: dict, :class:`~scrapy.settings.Settings` object or ``None``

    :param install_root_handler: whether to install root logging handler
        (default: True)
    :type install_root_handler: bool

    This function does:

    - Route warnings and twisted logging through Python standard logging
    - Assign DEBUG and ERROR level to Scrapy and Twisted loggers respectively
    - Route stdout to log if LOG_STDOUT setting is True

    When ``install_root_handler`` is True (default), this function also
    creates a handler for the root logger according to given settings
    (see :ref:`topics-logging-settings`). You can override default options
    using ``settings`` argument. When ``settings`` is empty or None, defaults
    are used.
    """
    if not sys.warnoptions:
        # Route warnings through python logging
        logging.captureWarnings(True)

    observer = twisted_log.PythonLoggingObserver('twisted')
    observer.start()

    dictConfig(DEFAULT_LOGGING)

    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)

    if settings.getbool('LOG_STDOUT'):
        sys.stdout = StreamLogger(logging.getLogger('stdout'))

    if install_root_handler:
        install_scrapy_root_handler(settings) 
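If the root handler installed here is not wanted, the Scrapy documentation suggests disabling it and configuring Python logging directly; a brief sketch (file name and format are illustrative):

import logging
from scrapy.utils.log import configure_logging

configure_logging(install_root_handler=False)
logging.basicConfig(
    filename='log.txt',                    # illustrative log file
    format='%(levelname)s: %(message)s',
    level=logging.INFO,
)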
Example 16
def __init__(self, store_uri, download_func=None, settings=None):
        if not store_uri:
            raise NotConfigured

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        cls_name = "FilesPipeline"
        self.store = self._get_store(store_uri)
        resolve = functools.partial(self._key_for_pipe,
                                    base_class_name=cls_name,
                                    settings=settings)
        self.expires = settings.getint(
            resolve('FILES_EXPIRES'), self.EXPIRES
        )
        if not hasattr(self, "FILES_URLS_FIELD"):
            self.FILES_URLS_FIELD = self.DEFAULT_FILES_URLS_FIELD
        if not hasattr(self, "FILES_RESULT_FIELD"):
            self.FILES_RESULT_FIELD = self.DEFAULT_FILES_RESULT_FIELD
        self.files_urls_field = settings.get(
            resolve('FILES_URLS_FIELD'), self.FILES_URLS_FIELD
        )
        self.files_result_field = settings.get(
            resolve('FILES_RESULT_FIELD'), self.FILES_RESULT_FIELD
        )

        super(FilesPipeline, self).__init__(download_func=download_func, settings=settings) 
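The keys resolved above (FILES_EXPIRES, FILES_URLS_FIELD, FILES_RESULT_FIELD) come from project settings; a hedged sketch of a configuration that would enable the pipeline (the storage path is illustrative):

from scrapy.settings import Settings

settings = Settings({
    'ITEM_PIPELINES': {'scrapy.pipelines.files.FilesPipeline': 1},
    'FILES_STORE': '/tmp/downloaded-files',   # placeholder storage path
    'FILES_EXPIRES': 90,                      # days before a file is re-downloaded
})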
Example 17
def __init__(self, download_func=None, settings=None):
        self.download_func = download_func

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)
        resolve = functools.partial(self._key_for_pipe,
                                    base_class_name="MediaPipeline",
                                    settings=settings)
        self.allow_redirects = settings.getbool(
            resolve('MEDIA_ALLOW_REDIRECTS'), False
        )
        self._handle_statuses(self.allow_redirects) 
Example 18
def __init__(self, store_uri, download_func=None, settings=None):
        super(ImagesPipeline, self).__init__(store_uri, settings=settings,
                                             download_func=download_func)

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        resolve = functools.partial(self._key_for_pipe,
                                    base_class_name="ImagesPipeline",
                                    settings=settings)
        self.expires = settings.getint(
            resolve("IMAGES_EXPIRES"), self.EXPIRES
        )

        if not hasattr(self, "IMAGES_RESULT_FIELD"):
            self.IMAGES_RESULT_FIELD = self.DEFAULT_IMAGES_RESULT_FIELD
        if not hasattr(self, "IMAGES_URLS_FIELD"):
            self.IMAGES_URLS_FIELD = self.DEFAULT_IMAGES_URLS_FIELD

        self.images_urls_field = settings.get(
            resolve('IMAGES_URLS_FIELD'),
            self.IMAGES_URLS_FIELD
        )
        self.images_result_field = settings.get(
            resolve('IMAGES_RESULT_FIELD'),
            self.IMAGES_RESULT_FIELD
        )
        self.min_width = settings.getint(
            resolve('IMAGES_MIN_WIDTH'), self.MIN_WIDTH
        )
        self.min_height = settings.getint(
            resolve('IMAGES_MIN_HEIGHT'), self.MIN_HEIGHT
        )
        self.thumbs = settings.get(
            resolve('IMAGES_THUMBS'), self.THUMBS
        ) 
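Similarly, the image-specific keys read above map to project settings; a sketch of a matching configuration (store path and size limits are illustrative, not from the original project):

from scrapy.settings import Settings

settings = Settings({
    'ITEM_PIPELINES': {'scrapy.pipelines.images.ImagesPipeline': 1},
    'IMAGES_STORE': '/tmp/downloaded-images',            # placeholder storage path
    'IMAGES_MIN_WIDTH': 110,                             # skip images smaller than this
    'IMAGES_MIN_HEIGHT': 110,
    'IMAGES_THUMBS': {'small': (50, 50), 'big': (270, 270)},
})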
Example 19
def setUp(self):
        self.spider = Spider('default')
        self.mocked_hsref = mock.Mock()
        self.patch = mock.patch('sh_scrapy.hsref.hsref', self.mocked_hsref)
        self.crawler_mock = mock.Mock()
        self.crawler_mock.settings = Settings(
            {'PAGE_STORAGE_ENABLED': True,
             'PAGE_STORAGE_MODE': 'VERSIONED_CACHE',
             'PAGE_STORAGE_LIMIT': 10,
             'PAGE_STORAGE_ON_ERROR_LIMIT': 5})
        self.mocked_hsref.project.collections.url = '/test/url'
        self.patch.start()
        self.instance = PageStorageMiddleware.from_crawler(self.crawler_mock) 
Example 20
def test_from_crawler(self, mocked_hsref):
        crawler_mock = mock.Mock()
        crawler_mock.settings = Settings()
        self.assertRaises(NotConfigured,
                          PageStorageMiddleware.from_crawler,
                          crawler_mock)
        # test creating an instance for all other cases
        crawler_mock.settings = mock.Mock()
        mocked_values = [(True, False), (False, True), (True, True)]
        crawler_mock.settings.side_effect = mocked_values
        for _ in range(len(mocked_values)):
            assert isinstance(PageStorageMiddleware.from_crawler(crawler_mock),
                              PageStorageMiddleware) 
Example 21
def start_spider(args):
	settings.LOG_LEVEL = args.log_level
	project_settings = Settings()
	project_settings.setmodule(settings)
	
	process = CrawlerProcess(project_settings)
	
	process.crawl(ImageSpider, domains=args.domains, start_urls=args.start_urls, jobname=args.jobname, stay_under=args.stay_under,
			monitor=args.monitor, user_agent=args.user_agent, minsize=args.min_size, no_cache=args.no_cache,
			images_store=args.images_store, depth_limit=args.depth_limit, url_regex=args.url_regex,
			no_cdns=args.no_cdns, auto_throttle=args.auto_throttle, log_level=args.log_level)

	process.start() 
Example 22
def get_crawler_class(self, crawler):
        """
        Searches through the modules in self.__crawer_module for a crawler with
        the name passed along.

        :param str crawler: Name of the crawler to load
        :rtype: crawler-class
        """
        settings = Settings()
        settings.set('SPIDER_MODULES', [self.__crawer_module])
        spider_loader = SpiderLoader(settings)
        return spider_loader.load(crawler) 
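SpiderLoader, as used above, can also enumerate the spiders it discovered, which helps when the crawler name comes from user input; a small sketch assuming a placeholder module path:

from scrapy.settings import Settings
from scrapy.spiderloader import SpiderLoader

settings = Settings({'SPIDER_MODULES': ['myproject.spiders']})  # placeholder module path
loader = SpiderLoader.from_settings(settings)
print(loader.list())                  # names of all discovered spiders
# spider_cls = loader.load('demo')    # raises KeyError if the name is unknown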
Example 23
def download_regions():
    if os.path.exists("data/regions.json"):
        os.remove("data/regions.json")

    settings = Settings()
    os.environ['SCRAPY_SETTINGS_MODULE'] = 'parser.dtpparser.settings'
    settings_module_path = os.environ['SCRAPY_SETTINGS_MODULE']
    settings.setmodule(settings_module_path, priority='project')
    process = CrawlerProcess(settings)

    process.crawl(RegionSpider)
    process.start() 
Example 24
def download_dtp():
    if os.path.exists("data/dtp.json"):
        os.remove("data/dtp.json")
    settings = Settings()
    os.environ['SCRAPY_SETTINGS_MODULE'] = 'parser.dtpparser.settings'
    settings_module_path = os.environ['SCRAPY_SETTINGS_MODULE']
    settings.setmodule(settings_module_path, priority='project')
    process = CrawlerProcess(settings)

    process.crawl(DtpSpider)
    process.start() 
Example 25
def main():
    # configure the settings for the crawler and spider
    args = parse_args()
    config = {
        'domains': args.domains,
        'directory': args.output,
        'allow': args.allow,
        'deny': args.deny,
        'unix': args.unix,
    }
    settings = Settings({
        'USER_AGENT': (
            'Wayback Machine Scraper/{0} '
            '(+https://github.com/sangaline/scrapy-wayback-machine)'
        ).format(get_distribution('wayback-machine-scraper').version),
        'LOG_LEVEL': 'DEBUG' if args.verbose else 'INFO',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_wayback_machine.WaybackMachineMiddleware': 5,
        },
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_DEBUG': args.verbose,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': args.concurrency,
        'WAYBACK_MACHINE_TIME_RANGE': (getattr(args, 'from'), args.to),
    })

    # start the crawler
    process = CrawlerProcess(settings)
    process.crawl(MirrorSpider, **config)
    process.start()