Python源码示例:h2o.import_file()
示例1
def DSS_dataset_to_H2O_frame(dataset_name): #, partition_id = None
"""This function passes the path of the data files to H2O (it does not stream the data through Python)."""
dataset = dataiku.Dataset(dataset_name)
settings = dataset.get_config()
if settings['type'] not in ['Filesystem', 'UploadedFiles', 'HDFS']:
print 'Warning: Datasets of type '+settings['type']+' are not supported for now. '
'Supported types are Filesystem, UploadedFiles and HDFS.'
separator = settings['formatParams'].get('separator',"").decode('unicode_escape')
print 'separator: <' + separator.encode('unicode_escape') + '>'
if separator == '\t':
print "Warning: H2O does not seems to support empty columns when the separator is tab."
col_names = [col['name'] for col in settings['schema']['columns']]
dataset_path = dataset.get_location_info()['info']['path'].encode('utf-8')
pathsByPartition = dataset.get_files_info()['pathsByPartition']
partitions = dataset.read_partitions if dataset.read_partitions else ['NP']
files = [file for partition in partitions for file in pathsByPartition[partition]]
filepaths = [dataset_path + file['path'] for file in files if has_data(file)]
print "filepaths:"
for f in filepaths:
print f
return h2o.import_file(
path = filepaths,
destination_frame = 'DSS.H2O_connector.dataset.' + dataset.full_name + '.' + '/'.join(partitions),
header = 0 if 'parseHeaderRow' not in settings['formatParams'] else 1 if settings['formatParams']['parseHeaderRow'] else -1,
sep = separator,
col_names = col_names,
col_types=None,
na_strings=None
# ,parse_type= 'CSV' if settings['formatType']=='csv' else None
)
示例2
def test_init_read(self):
    """Smoke test: start H2O, import the training CSV, check the row count."""
    h2o.init()
    frame = h2o.import_file("/input/tests/data/train.csv", destination_frame="train")
    self.assertEqual(100, frame.nrow)
示例3
def _prepare_one_hot(file, y, exclude_cols=None):
    """Load an H2O dataset and one-hot encode its categorical columns.

    Imports *file* (relative to this module's directory), splits it 95/5
    into train/test (seed 42), and replaces every enum column — except *y*
    and any in *exclude_cols* — with scikit-learn one-hot columns named
    "<column>.<level>" (encoder fitted on the train split only). Returns
    (train, test): train as an H2OFrame including *y*, test as a pandas
    DataFrame without *y*.
    """
    if exclude_cols is None:
        exclude_cols = []
    here = os.path.dirname(os.path.realpath(__file__))
    frame = h2o.import_file(here + "/" + file)
    train, test = frame.split_frame([0.95], seed=42)

    # Partition columns: enum columns get encoded, the rest pass through.
    cols_to_encode, other_cols = [], []
    for name, ctype in test.types.items():
        if name == y or name in exclude_cols:
            continue
        (cols_to_encode if ctype == "enum" else other_cols).append(name)

    train_df = train.as_data_frame()
    enc = OneHotEncoder(categories='auto', handle_unknown='ignore')
    enc.fit(train_df.loc[:, cols_to_encode])

    # Encoded column names: "<original column>.<category level>".
    colnames = [col + "." + val
                for col, cats in zip(cols_to_encode, enc.categories_)
                for val in cats]

    def _encode(df):
        # Apply the fitted encoder and label the resulting dense columns.
        encoded = pd.DataFrame(enc.transform(df.loc[:, cols_to_encode].values).toarray())
        encoded.columns = colnames
        return encoded

    train = H2OFrame(train_df.loc[:, other_cols + [y]].join(_encode(train_df)))

    test_df = test.as_data_frame()
    test = test_df.loc[:, other_cols].join(_encode(test_df))
    return train, test