Python source code examples: joblib.hash()
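
joblib.hash computes a deterministic content digest of an arbitrary Python object (recursing into containers and hashing numpy arrays by their contents rather than their identity) and returns a hex string. A minimal sketch of the basics; 'md5' is joblib's documented default for hash_name, with 'sha1' also supported:

import numpy as np
import joblib

a = np.arange(10)
b = np.arange(10)
print(joblib.hash(a) == joblib.hash(b))                # True: equal contents give equal digests
print(joblib.hash(a, hash_name='sha1'))                # 40-character sha1 hex digest
print(joblib.hash(a) == joblib.hash(a.astype(float)))  # False: dtype is part of the digest

The examples below, gathered from open-source projects, use this digest either to fingerprint data artifacts or to assert that an operation did not mutate an object.
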
Example 1
def test_setting_ndarray(adata):
    adata.obsp["a"] = np.ones((M, M))
    adata.varp["a"] = np.ones((N, N))
    assert np.all(adata.obsp["a"] == np.ones((M, M)))
    assert np.all(adata.varp["a"] == np.ones((N, N)))
    h = joblib.hash(adata)
    with pytest.raises(ValueError):
        adata.obsp["b"] = np.ones((int(M / 2), M))
    with pytest.raises(ValueError):
        adata.obsp["b"] = np.ones((M, int(M * 2)))
    with pytest.raises(ValueError):
        adata.varp["b"] = np.ones((int(N / 2), N))
    with pytest.raises(ValueError):
        adata.varp["b"] = np.ones((N, int(N * 2)))
    assert h == joblib.hash(adata)
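
Most of the anndata test snippets below follow the same pattern: compute joblib.hash(adata) before an operation that must not modify the object, then assert the digest is unchanged afterward. A minimal standalone sketch of that guard, using a plain numpy array instead of an AnnData object:

import numpy as np
import joblib
import pytest

arr = np.ones((4, 4))
h = joblib.hash(arr)  # content digest before the failing operation

with pytest.raises(ValueError):
    arr.reshape(3, 5)  # cannot reshape 16 elements into 15

assert h == joblib.hash(arr)  # the failed call left the array untouched
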
Example 2
def test_setting_sparse(adata):
    obsp_sparse = sparse.random(M, M)
    adata.obsp["a"] = obsp_sparse
    assert not np.any((adata.obsp["a"] != obsp_sparse).data)
    varp_sparse = sparse.random(N, N)
    adata.varp["a"] = varp_sparse
    assert not np.any((adata.varp["a"] != varp_sparse).data)
    h = joblib.hash(adata)
    bad_obsp_sparse = sparse.random(M * 2, M)
    with pytest.raises(ValueError):
        adata.obsp["b"] = bad_obsp_sparse
    bad_varp_sparse = sparse.random(N * 2, N)
    with pytest.raises(ValueError):
        adata.varp["b"] = bad_varp_sparse
    assert h == joblib.hash(adata)
Example 3
def test_setting_ndarray(adata):
    adata.obsm["a"] = np.ones((M, 10))
    adata.varm["a"] = np.ones((N, 10))
    assert np.all(adata.obsm["a"] == np.ones((M, 10)))
    assert np.all(adata.varm["a"] == np.ones((N, 10)))
    h = joblib.hash(adata)
    with pytest.raises(ValueError):
        adata.obsm["b"] = np.ones((int(M / 2), 10))
    with pytest.raises(ValueError):
        adata.obsm["b"] = np.ones((int(M * 2), 10))
    with pytest.raises(ValueError):
        adata.varm["b"] = np.ones((int(N / 2), 10))
    with pytest.raises(ValueError):
        adata.varm["b"] = np.ones((int(N * 2), 10))
    assert h == joblib.hash(adata)
Example 4
def test_setting_sparse(adata):
    obsm_sparse = sparse.random(M, 100)
    adata.obsm["a"] = obsm_sparse
    assert not np.any((adata.obsm["a"] != obsm_sparse).data)
    varm_sparse = sparse.random(N, 100)
    adata.varm["a"] = varm_sparse
    assert not np.any((adata.varm["a"] != varm_sparse).data)
    h = joblib.hash(adata)
    bad_obsm_sparse = sparse.random(M * 2, M)
    with pytest.raises(ValueError):
        adata.obsm["b"] = bad_obsm_sparse
    bad_varm_sparse = sparse.random(N * 2, N)
    with pytest.raises(ValueError):
        adata.varm["b"] = bad_varm_sparse
    assert h == joblib.hash(adata)
Example 5
def test_set_var(adata, subset_func):
    init_hash = joblib.hash(adata)
    subset = adata[:, subset_func(adata.var_names)]
    new_var = pd.DataFrame(
        dict(a=np.ones(subset.n_vars), b=np.ones(subset.n_vars)),
        index=subset.var_names,
    )
    assert subset.is_view
    subset.var = new_var
    assert not subset.is_view
    assert np.all(subset.var == new_var)
    assert joblib.hash(adata) == init_hash
Example 6
def test_set_obsm(adata):
    init_hash = joblib.hash(adata)
    dim0_size = np.random.randint(2, adata.shape[0] - 1)
    dim1_size = np.random.randint(1, 99)
    orig_obsm_val = adata.obsm["o"].copy()
    subset_idx = np.random.choice(adata.obs_names, dim0_size, replace=False)
    subset = adata[subset_idx, :]
    assert subset.is_view
    subset.obsm = dict(o=np.ones((dim0_size, dim1_size)))
    assert not subset.is_view
    assert np.all(orig_obsm_val == adata.obsm["o"])  # Checking for mutation
    assert np.all(subset.obsm["o"] == np.ones((dim0_size, dim1_size)))
    subset = adata[subset_idx, :]
    subset_hash = joblib.hash(subset)
    with pytest.raises(ValueError):
        subset.obsm = dict(o=np.ones((dim0_size + 1, dim1_size)))
    with pytest.raises(ValueError):
        subset.varm = dict(o=np.ones((dim0_size - 1, dim1_size)))
    assert subset_hash == joblib.hash(subset)
    # Only modifications have been made to a view
    assert init_hash == joblib.hash(adata)
Example 7
def test_not_set_subset_X(matrix_type, subset_func):
    adata = ad.AnnData(matrix_type(asarray(sparse.random(20, 20))))
    init_hash = joblib.hash(adata)
    orig_X_val = adata.X.copy()
    while True:
        subset_idx = slice_subset(adata.var_names)
        if adata[:, subset_idx].shape[1] > 2:
            break
    subset = adata[:, subset_idx]
    internal_idx = _normalize_index(
        subset_func(np.arange(subset.X.shape[1])), subset.var_names
    )
    assert subset.is_view
    subset.X[:, internal_idx] = 1
    assert not subset.is_view
    assert not np.any(asarray(adata.X != orig_X_val))
    assert init_hash == joblib.hash(adata)
Example 8
def test_set_subset_obsm(adata, subset_func):
    init_hash = joblib.hash(adata)
    orig_obsm_val = adata.obsm["o"].copy()
    while True:
        subset_idx = slice_subset(adata.obs_names)
        if len(adata[subset_idx, :]) > 2:
            break
    subset = adata[subset_idx, :]
    internal_idx = _normalize_index(
        subset_func(np.arange(subset.obsm["o"].shape[0])), subset.obs_names
    )
    assert subset.is_view
    subset.obsm["o"][internal_idx] = 1
    assert not subset.is_view
    assert np.all(adata.obsm["o"] == orig_obsm_val)
    assert init_hash == joblib.hash(adata)
Example 9
def test_set_subset_varm(adata, subset_func):
    init_hash = joblib.hash(adata)
    orig_varm_val = adata.varm["o"].copy()
    while True:
        subset_idx = slice_subset(adata.var_names)
        if adata[:, subset_idx].shape[1] > 2:
            break
    subset = adata[:, subset_idx]
    internal_idx = _normalize_index(
        subset_func(np.arange(subset.varm["o"].shape[0])), subset.var_names
    )
    assert subset.is_view
    subset.varm["o"][internal_idx] = 1
    assert not subset.is_view
    assert np.all(adata.varm["o"] == orig_varm_val)
    assert init_hash == joblib.hash(adata)
Example 10
def test_view_delitem(attr):
    adata = gen_adata((10, 10))
    getattr(adata, attr)["to_delete"] = np.ones((10, 10))
    # Shouldn't be a subclass, should be an ndarray
    assert type(getattr(adata, attr)["to_delete"]) is np.ndarray
    view = adata[5:7, :][:, :5]
    adata_hash = joblib.hash(adata)
    view_hash = joblib.hash(view)
    getattr(view, attr).__delitem__("to_delete")
    assert not view.is_view
    assert "to_delete" not in getattr(view, attr)
    assert "to_delete" in getattr(adata, attr)
    assert adata_hash == joblib.hash(adata)
    assert view_hash != joblib.hash(view)
Example 11
def get_data_hashes(self, exclude_list=None, hash_type='sha1'):
    """Compute the hash of data items

    exclude_list: list or None
        List of attributes to skip.
        If None, skips ['metadata']
    hash_type: {'sha1', 'md5', 'sha256'}
        Algorithm to use for hashing
    """
    if exclude_list is None:
        exclude_list = ['metadata']
    ret = {'hash_type': hash_type}
    for key, value in self.items():
        if key in exclude_list:
            continue
        ret[f"{key}_hash"] = joblib.hash(value, hash_name=hash_type)
    return ret
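
For context, a minimal standalone sketch of the same keyed-hashing idiom, with hypothetical data (note joblib officially documents 'md5' and 'sha1' for hash_name):

import numpy as np
import joblib

data = {"X": np.arange(12).reshape(3, 4), "metadata": {"source": "example"}}

# Hash every item except the excluded keys, mirroring get_data_hashes above.
hashes = {
    f"{key}_hash": joblib.hash(value, hash_name='sha1')
    for key, value in data.items()
    if key != "metadata"
}
print(hashes)  # e.g. {'X_hash': '<40-char sha1 hex digest>'}
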
Example 12
def train_model(algorithm_params=None,
                run_number=0, *, dataset_name, algorithm_name, hash_type,
                **kwargs):
    """Train a model using the specified algorithm on the given dataset."""
    if algorithm_params is None:
        algorithm_params = {}
    metadata = {}
    ds = Dataset.load(dataset_name)
    metadata['data_hash'] = joblib.hash(ds.data, hash_name=hash_type)
    metadata['target_hash'] = joblib.hash(ds.target, hash_name=hash_type)
    model = available_algorithms(keys_only=False)[algorithm_name]
    model.set_params(**algorithm_params)
    start_time = time.time()
    model.fit(ds.data, y=ds.target)
    end_time = record_time_interval('train_model', start_time)
    metadata['start_time'] = start_time
    metadata['duration'] = end_time - start_time
    return model, metadata
Example 13
def test_set_obsm_key(adata):
    init_hash = joblib.hash(adata)
    orig_obsm_val = adata.obsm["o"].copy()
    subset_obsm = adata[:50]
    assert subset_obsm.is_view
    subset_obsm.obsm["o"] = np.ones((50, 20))
    assert not subset_obsm.is_view
    assert np.all(adata.obsm["o"] == orig_obsm_val)
    assert init_hash == joblib.hash(adata)
Example 14
def test_set_varm_key(adata):
    init_hash = joblib.hash(adata)
    orig_varm_val = adata.varm["o"].copy()
    subset_varm = adata[:, :50]
    assert subset_varm.is_view
    subset_varm.varm["o"] = np.ones((50, 20))
    assert not subset_varm.is_view
    assert np.all(adata.varm["o"] == orig_varm_val)
    assert init_hash == joblib.hash(adata)
Example 15
def test_set_obs(adata, subset_func):
    init_hash = joblib.hash(adata)
    subset = adata[subset_func(adata.obs_names), :]
    new_obs = pd.DataFrame(
        dict(a=np.ones(subset.n_obs), b=np.ones(subset.n_obs)),
        index=subset.obs_names,
    )
    assert subset.is_view
    subset.obs = new_obs
    assert not subset.is_view
    assert np.all(subset.obs == new_obs)
    assert joblib.hash(adata) == init_hash
Example 16
def test_view_failed_delitem(attr):
    adata = gen_adata((10, 10))
    view = adata[5:7, :][:, :5]
    adata_hash = joblib.hash(adata)
    view_hash = joblib.hash(view)
    with pytest.raises(KeyError):
        getattr(view, attr).__delitem__("not a key")
    assert view.is_view
    assert adata_hash == joblib.hash(adata)
    assert view_hash == joblib.hash(view)
Example 17
def __hash__(self):
    return hash(joblib.hash((self._final_estimator.coef_, self._final_estimator.intercept_)))
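
joblib.hash returns a hex digest string, so the snippet wraps it in the built-in hash() to get the int that the __hash__ protocol requires. A minimal standalone sketch of the same idea, using a hypothetical class rather than the estimator above:

import numpy as np
import joblib

class FittedModel:
    def __init__(self, coef, intercept):
        self.coef_ = np.asarray(coef)
        self.intercept_ = float(intercept)

    def __hash__(self):
        # joblib.hash gives a deterministic hex string for the array contents;
        # hash() converts it to the int required by the __hash__ protocol.
        return hash(joblib.hash((self.coef_, self.intercept_)))

print(hash(FittedModel([1.0, 2.0], 0.5)) == hash(FittedModel([1.0, 2.0], 0.5)))  # True
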
Example 18
def check_fit_does_not_overwrite_hyper_params(Estimator):
    # Check that we do not overwrite hyper-parameters in fit
    estimator = _construct_instance(Estimator)
    set_random_state(estimator)
    # Make a physical copy of the original estimator parameters before fitting.
    params = estimator.get_params()
    original_params = deepcopy(params)
    # Fit the model
    fit_args = _make_args(estimator, "fit")
    estimator.fit(*fit_args)
    # Compare the state of the model parameters with the original parameters
    new_params = estimator.get_params()
    for param_name, original_value in original_params.items():
        new_value = new_params[param_name]
        # We should never change or mutate the internal state of input
        # parameters by default. To check this we use the joblib.hash function
        # that recursively introspects any subobjects to compute a checksum.
        # The only exception to this rule of immutable constructor parameters
        # is a possible RandomState instance, but in this check we explicitly
        # fixed the random_state params recursively to be integer seeds.
        assert joblib.hash(new_value) == joblib.hash(original_value), (
            "Estimator %s should not change or mutate "
            "the parameter %s from %s to %s during fit."
            % (estimator.__class__.__name__, param_name, original_value,
               new_value))
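
The equality check above works because joblib.hash recursively hashes object contents (including numpy arrays nested inside lists or dicts), so any in-place mutation changes the digest. A minimal sketch of the idiom, with hypothetical values:

import numpy as np
import joblib

param = {"weights": np.zeros(3), "alpha": 0.1}
before = joblib.hash(param)

param["weights"][0] = 1.0            # in-place mutation of a nested array
assert joblib.hash(param) != before  # the recursive checksum catches it
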
Example 19
def add_url(self, url=None, *, hash_type='sha1', hash_value=None,
            name=None, file_name=None, force=False, unpack_action=None):
    """
    Add a URL to the file list

    hash_type: {'sha1', 'md5', 'sha256'}
    hash_value: string or None
        If None, hash will be computed from the downloaded file
    file_name: string or None
        Name of downloaded file. If None, will be the last component of the URL
    url: string
        URL to fetch
    name: str
        text description of this file.
    force: boolean (default False)
        If True, overwrite an existing entry for this file
    unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None
        action to take in order to unpack this file. If None, infers from file type.
    """
    if url is None:
        raise Exception("`url` is required")
    file_name = infer_filename(file_name=file_name, url=url)
    fetch_dict = {
        'fetch_action': 'url',
        'file_name': file_name,
        'hash_type': hash_type,
        'hash_value': hash_value,
        'name': name,
        'url': url,
    }
    if unpack_action:
        fetch_dict.update({'unpack_action': unpack_action})
    if file_name in self.file_dict and not force:
        raise Exception(f"{file_name} already in file_dict. Use `force=True` to add anyway.")
    self.file_dict[file_name] = fetch_dict
    self.fetched_ = False
Example 20
def __hash__(self):
    return hash(self.to_hash())
Example 21
def save_model(metadata=None, model_path=None, hash_type='sha1',
               *, model_name, model):
    """Save a model to disk

    Parameters
    ----------
    model_name: str
        Unique key to use as model name (and filename)
    metadata: dict
        Model metadata
    model:
        sklearn estimator representing a model
    hash_type: {'sha1', 'md5'}
        hash algorithm to use for joblib hashing
    model_path: path, default `paths['trained_model_path']`
        Where model should be saved.

    Returns
    -------
    copy of metadata
    """
    if metadata is None:
        metadata = {}
    else:
        metadata = metadata.copy()
    if model_path is None:
        model_path = paths['trained_model_path']
    else:
        model_path = pathlib.Path(model_path)
    joblib.dump(model, model_path / f"{model_name}.model")
    metadata['model_hash'] = joblib.hash(model, hash_name=hash_type)
    save_json(model_path / f"{model_name}.metadata", metadata)
    return metadata
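
Example 21 pairs joblib.dump (persistence to disk) with joblib.hash (fingerprinting the in-memory object). A minimal sketch of that pairing, with a hypothetical stand-in model and file name:

import joblib

model = {"coef": [0.25, -1.0], "intercept": 0.5}   # stand-in for an estimator
joblib.dump(model, "example.model")                 # persist to disk
fingerprint = joblib.hash(model, hash_name='sha1')  # digest of the in-memory object
print(fingerprint)

Recording the digest alongside the serialized file lets a later run verify that a reloaded model matches the one that was saved.
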
Example 22
def test_backing(adata, tmp_path, backing_h5ad):
    assert not adata.isbacked
    adata.filename = backing_h5ad
    adata.write()
    assert not adata.file.is_open
    assert adata.isbacked
    assert adata[:, 0].is_view
    assert adata[:, 0].X.tolist() == np.reshape([1, 4, 7], (3, 1)).tolist()
    # this might give us trouble, as the user might not
    # know that the file is open again...
    assert adata.file.is_open
    adata[:2, 0].X = [0, 0]
    assert adata[:, 0].X.tolist() == np.reshape([0, 0, 7], (3, 1)).tolist()
    adata_subset = adata[:2, [0, 1]]
    assert adata_subset.is_view
    subset_hash = joblib.hash(adata_subset)
    # cannot set view in backing mode...
    with pytest.raises(ValueError):
        adata_subset.obs["foo"] = range(2)
    with pytest.raises(ValueError):
        adata_subset.var["bar"] = -12
    with pytest.raises(ValueError):
        adata_subset.obsm["o2"] = np.ones((2, 2))
    with pytest.raises(ValueError):
        adata_subset.varm["v2"] = np.zeros((2, 2))
    with pytest.raises(ValueError):
        adata_subset.layers["float2"] = adata_subset.layers["float"].copy()
    # Things should stay the same after failed operations
    assert subset_hash == joblib.hash(adata_subset)
    assert adata_subset.is_view
    # need to copy first
    adata_subset = adata_subset.copy(tmp_path / "test.subset.h5ad")
    # now transition to actual object
    assert not adata_subset.is_view
    adata_subset.obs["foo"] = range(2)
    assert not adata_subset.is_view
    assert adata_subset.isbacked
    assert adata_subset.obs["foo"].tolist() == list(range(2))
    # save
    adata_subset.write()
    # TODO: Also test updating the backing file inplace
Example 23
def __init__(self,
             name='datasource',
             parse_function=None,
             dataset_dir=None,
             file_list=None):
    """Create a DataSource

    Parameters
    ----------
    name: str
        name of dataset
    parse_function: func (or partial)
        Function that will be called to process raw data into a usable Dataset
    dataset_dir: path
        default location for raw files
    file_list: list
        list of file_dicts associated with this DataSource.
        Valid keys for each file_dict include:
            url: (optional)
                URL of resource to be fetched
            hash_type: {'sha1', 'md5', 'sha256'}
                Type of hash function used to verify file integrity
            hash_value: string
                Value of hash used to verify file integrity
            file_name: string (optional)
                filename to use when saving file locally. If omitted, it will be inferred from url or source_file
            name: string or {'DESCR', 'LICENSE'} (optional)
                description of the file. If DESCR or LICENSE, will be used as metadata
            unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None
                action to take in order to unpack this file. If None, infers from file type.
    """
    if file_list is None:
        file_list = []
    if dataset_dir is None:
        dataset_dir = paths['raw_data_path']
    if parse_function is None:
        parse_function = process_dataset_default
    self.name = name
    self.file_dict = {infer_filename(**item): item for item in file_list}
    self.parse_function = parse_function
    self.dataset_dir = dataset_dir
    # sklearn-style attributes. Usually these would be set in fit()
    self.fetched_ = False
    self.fetched_files_ = []
    self.unpacked_ = False
    self.unpack_path_ = None
Example 24
def add_file(self, source_file=None, *, hash_type='sha1', hash_value=None,
             name=None, file_name=None, unpack_action=None,
             force=False):
    """
    Add a file to the file list.

    This file must exist on disk, as there is no method specified for fetching it.
    This is useful when the data source requires an offline procedure for downloading.

    hash_type: {'sha1', 'md5', 'sha256'}
    hash_value: string or None
        If None, hash will be computed from the specified file
    file_name: string
        Name of destination file, relative to paths['raw_data_dir']
    name: str
        text description of this file.
    source_file: path
        file to be copied
    force: boolean (default False)
        If True, overwrite an existing entry for this file
    unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None
        action to take in order to unpack this file. If None, infers from file type.
    """
    if source_file is None:
        raise Exception("`source_file` is required")
    source_file = pathlib.Path(source_file)
    if not source_file.exists():
        logger.warning(f"{source_file} not found on disk")
    file_name = infer_filename(file_name=file_name, source_file=source_file)
    if hash_value is None:
        logger.debug(f"Hash unspecified. Computing {hash_type} hash of {source_file.name}")
        hash_value = hash_file(source_file, algorithm=hash_type).hexdigest()
    fetch_dict = {
        'fetch_action': 'copy',
        'file_name': file_name,
        'hash_type': hash_type,
        'hash_value': hash_value,
        'name': name,
        'source_file': str(source_file),
    }
    if unpack_action:
        fetch_dict.update({'unpack_action': unpack_action})
    existing_files = [f.get('source_file') for f in self.file_dict.values()]
    existing_hashes = [f['hash_value'] for f in self.file_dict.values() if f.get('hash_value')]
    if file_name in self.file_dict and not force:
        raise Exception(f"{file_name} already in file_dict. Use `force=True` to add anyway.")
    if str(source_file) in existing_files and not force:
        raise Exception(f"source file: {source_file} already in file list. Use `force=True` to add anyway.")
    if hash_value in existing_hashes and not force:
        raise Exception(f"file with hash {hash_value} already in file list. Use `force=True` to add anyway.")
    logger.warning("Reproducibility Issue: add_file is often not reproducible. If possible, use add_manual_download instead")
    self.file_dict[file_name] = fetch_dict
    self.fetched_ = False
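
Examples 24 and 25 call a hash_file helper whose implementation is not shown. A minimal sketch of what such a helper might look like, matching the .hexdigest() usage above; this is an assumption built only on hashlib, not the project's actual code:

import hashlib
import pathlib

def hash_file(path, algorithm='sha1', block_size=2 ** 20):
    """Return a hashlib object for the file's contents (call .hexdigest() on it)."""
    hasher = hashlib.new(algorithm)
    with pathlib.Path(path).open('rb') as fd:
        # Read in chunks so large files don't have to fit in memory.
        for chunk in iter(lambda: fd.read(block_size), b''):
            hasher.update(chunk)
    return hasher
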
Example 25
def fetch(self, fetch_path=None, force=False):
    """Fetch files in the `file_dict` to `raw_data_dir` and check hashes.

    Parameters
    ----------
    fetch_path: None or string
        By default, assumes dataset_dir
    force: Boolean
        If True, ignore the cache and re-download the files each time
    """
    if self.fetched_ and force is False:
        # validate the downloaded files:
        for filename, item in self.file_dict.items():
            raw_data_file = paths['raw_data_path'] / filename
            if not raw_data_file.exists():
                logger.warning(f"{raw_data_file.name} missing. Invalidating fetch cache")
                self.fetched_ = False
                break
            raw_file_hash = hash_file(raw_data_file, algorithm=item['hash_type']).hexdigest()
            if raw_file_hash != item['hash_value']:
                logger.warning(f"{raw_data_file.name} {item['hash_type']} hash invalid ({raw_file_hash} != {item['hash_value']}). Invalidating fetch cache.")
                self.fetched_ = False
                break
        else:
            logger.debug(f'Data Source {self.name} is already fetched. Skipping')
            return

    if fetch_path is None:
        fetch_path = self.dataset_dir
    else:
        fetch_path = pathlib.Path(fetch_path)

    self.fetched_ = False
    self.fetched_files_ = []
    for key, item in self.file_dict.items():
        status, result, hash_value = fetch_file(**item)
        logger.debug(f"Fetching {key}: status:{status}")
        if status:  # True (cached) or HTTP code (successful download)
            item['hash_value'] = hash_value
            item['file_name'] = result.name
            self.fetched_files_.append(result)
        else:
            if item.get('fetch_action', False) != 'message':
                logger.error(f"fetch of {key} returned: {result}")
            break
    else:
        self.fetched_ = True

    self.unpacked_ = False
    return self.fetched_