Python源码示例:h2o.import_file()

示例1
def DSS_dataset_to_H2O_frame(dataset_name):   #, partition_id = None
    """Load a Dataiku DSS dataset into an H2O frame by file path.

    Passes the paths of the dataset's underlying data files directly to H2O
    (the data is not streamed through Python).

    :param dataset_name: name of the DSS dataset to load.
    :returns: the H2OFrame returned by ``h2o.import_file``.
    """
    dataset = dataiku.Dataset(dataset_name)
    settings = dataset.get_config()
    if settings['type'] not in ['Filesystem', 'UploadedFiles', 'HDFS']:
        # Bug fix: the second sentence used to be a separate bare string
        # literal on its own line (a dead expression statement), so it was
        # never printed.  Adjacent-literal concatenation makes it one message.
        print('Warning: Datasets of type ' + settings['type'] + ' are not supported for now. '
              'Supported types are Filesystem, UploadedFiles and HDFS.')
    # The separator is stored escaped (e.g. "\\t") in the dataset settings.
    separator = settings['formatParams'].get('separator',"").decode('unicode_escape')
    print('separator: <' + separator.encode('unicode_escape') + '>')
    if separator == '\t':
        print("Warning: H2O does not seems to support empty columns when the separator is tab.")
    col_names = [col['name'] for col in settings['schema']['columns']]
    dataset_path = dataset.get_location_info()['info']['path'].encode('utf-8')
    pathsByPartition = dataset.get_files_info()['pathsByPartition']
    # 'NP' is the pseudo-partition key used for non-partitioned datasets.
    partitions = dataset.read_partitions if dataset.read_partitions else ['NP']
    files = [file for partition in partitions for file in pathsByPartition[partition]]
    filepaths = [dataset_path + file['path'] for file in files if has_data(file)]
    print("filepaths:")
    for f in filepaths:
        print(f)
    return h2o.import_file(
        path = filepaths,
        destination_frame = 'DSS.H2O_connector.dataset.' + dataset.full_name + '.' + '/'.join(partitions),
        # header: 0 = let H2O guess, 1 = first row is a header, -1 = no header row.
        header = 0 if 'parseHeaderRow' not in settings['formatParams'] else 1 if settings['formatParams']['parseHeaderRow'] else -1,
        sep = separator,
        col_names = col_names,
        col_types=None,
        na_strings=None
        # ,parse_type= 'CSV' if settings['formatType']=='csv' else None
        )
示例2
def test_init_read(self):
        """Start H2O and verify the sample training CSV loads with 100 rows."""
        h2o.init()
        csv_path = "/input/tests/data/train.csv"
        frame = h2o.import_file(csv_path, destination_frame="train")
        self.assertEqual(100, frame.nrow)
示例3
def _prepare_one_hot(file, y, exclude_cols=None):
    """One-hot encode the categorical (enum) columns of an H2O-readable file.

    Splits the file 95/5 into train/test, fits a ``OneHotEncoder`` on the
    training split's enum columns, and applies it to both splits.

    :param file: file name, relative to this module's directory.
    :param y: name of the target column (kept un-encoded; present in the
        returned train split only, matching the original behavior).
    :param exclude_cols: optional list of column names to leave un-encoded.
    :returns: ``(train, test)`` where ``train`` is an H2OFrame including
        ``y`` and ``test`` is a pandas DataFrame without ``y``.
    """
    if exclude_cols is None:
        exclude_cols = []
    dir_path = os.path.dirname(os.path.realpath(__file__))
    frame = h2o.import_file(dir_path + "/" + file)
    train, test = frame.split_frame([0.95], seed=42)

    # Partition the columns: enums get one-hot encoded, everything else
    # (except the target and excluded columns) passes through unchanged.
    cols_to_encode = []
    other_cols = []
    for name, ctype in test.types.items():
        if name == y or name in exclude_cols:
            pass
        elif ctype == "enum":
            cols_to_encode.append(name)
        else:
            other_cols.append(name)

    train_frame = train.as_data_frame()
    train_encode = train_frame.loc[:, cols_to_encode]
    train_other = train_frame.loc[:, other_cols + [y]]
    enc = OneHotEncoder(categories='auto', handle_unknown='ignore')
    enc.fit(train_encode)
    # Build "<column>.<level>" names, one per encoded category.  str() guards
    # against non-string category values (the original raised TypeError on
    # numeric categories); zip replaces the index-based range(len(...)) loop.
    colnames = [col + "." + str(val)
                for col, cats in zip(cols_to_encode, enc.categories_)
                for val in cats]

    train_encoded = pd.DataFrame(enc.transform(train_encode.values).toarray(),
                                 columns=colnames)
    train = H2OFrame(train_other.join(train_encoded))

    test_frame = test.as_data_frame()
    test_encode = test_frame.loc[:, cols_to_encode]
    test_other = test_frame.loc[:, other_cols]

    test_encoded = pd.DataFrame(enc.transform(test_encode.values).toarray(),
                                columns=colnames)
    test = test_other.join(test_encoded)

    return train, test