Java source code examples: parquet.hadoop.metadata.CompressionCodecName
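CompressionCodecName is the enum Parquet uses to name its block compression codecs (UNCOMPRESSED, SNAPPY, GZIP, LZO, ...). Before the numbered examples, here is a minimal, self-contained sketch of the two calls most of the examples below combine: CompressionCodecName.fromConf(String) to resolve a configured codec name, and ParquetOutputFormat.setCompression(Job, CompressionCodecName) to apply it to a Hadoop job. It assumes the pre-org.apache package names used on this page; the configureCompression helper is illustrative and not part of the Parquet API.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import parquet.hadoop.ParquetOutputFormat;
import parquet.hadoop.metadata.CompressionCodecName;

public final class CompressionCodecNameSketch {

    // Hypothetical helper: resolve a codec name taken from configuration and apply it to the job.
    public static void configureCompression(Job job, String configuredName) {
        CompressionCodecName codec;
        try {
            // fromConf(null) yields UNCOMPRESSED; unrecognized names throw IllegalArgumentException.
            codec = CompressionCodecName.fromConf(configuredName);
        } catch (IllegalArgumentException e) {
            codec = CompressionCodecName.UNCOMPRESSED;
        }
        ParquetOutputFormat.setCompression(job, codec);
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        // "snappy" is matched case-insensitively by fromConf.
        configureCompression(job, "snappy");
    }
}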
Example 1
private ExaParquetWriterImpl(final MessageType schema,
                             final int numColumns,
                             final Configuration conf,
                             final Path path,
                             final String compressionType,
                             final ExaIterator exa,
                             final int firstColumnIndex,
                             final List<Integer> dynamicPartitionExaColNums) throws Exception {
    System.out.println("Path: " + path.toString());
    System.out.println("Parquet schema:\n" + schema);
    TupleWriteSupport.setSchema(schema, conf);
    this.writer = new ParquetWriter<>(path,
            new TupleWriteSupport(),
            CompressionCodecName.fromConf(compressionType),
            ParquetWriter.DEFAULT_BLOCK_SIZE,
            ParquetWriter.DEFAULT_PAGE_SIZE,
            ParquetWriter.DEFAULT_PAGE_SIZE,
            ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
            ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
            conf);
    // Create the Tuple object with an ExaIterator reference.
    this.row = new Tuple(exa, numColumns, firstColumnIndex, dynamicPartitionExaColNums);
}
Example 2
public static void writeAvro(DataSet<Tuple2<Void, Person>> data, String outputPath) throws IOException {
    // Set up the Hadoop job
    Job job = Job.getInstance();
    // Set up the Hadoop output format
    HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new AvroParquetOutputFormat(), job);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    AvroParquetOutputFormat.setSchema(job, Person.getClassSchema());
    ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ParquetOutputFormat.setEnableDictionary(job, true);
    // Register the output sink; the job runs when the Flink program is executed
    data.output(hadoopOutputFormat);
}
Example 3
public static void writeThrift(DataSet<Tuple2<Void, Person>> data, String outputPath) throws IOException {
    // Set up the Hadoop job
    Job job = Job.getInstance();
    // Set up the Hadoop output format
    HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ParquetThriftOutputFormat(), job);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ParquetOutputFormat.setEnableDictionary(job, true);
    ParquetThriftOutputFormat.setThriftClass(job, Person.class);
    // Register the output sink; the job runs when the Flink program is executed
    data.output(hadoopOutputFormat);
}
Example 4
@Override
public void setCompression( COMPRESSION comp ) throws Exception {
  inClassloader( () -> {
    CompressionCodecName codec;
    switch ( comp ) {
      case SNAPPY:
        codec = CompressionCodecName.SNAPPY;
        break;
      case GZIP:
        codec = CompressionCodecName.GZIP;
        break;
      case LZO:
        codec = CompressionCodecName.LZO;
        break;
      default:
        codec = CompressionCodecName.UNCOMPRESSED;
        break;
    }
    ParquetOutputFormat.setCompression( job, codec );
  } );
}
Example 5
public static void writeProtobuf(DataSet<Tuple2<Void, Person>> data, String outputPath) throws IOException {
    Job job = Job.getInstance();
    // Set up the Hadoop output format
    HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ProtoParquetOutputFormat(), job);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    ProtoParquetOutputFormat.setProtobufClass(job, Person.class);
    ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ParquetOutputFormat.setEnableDictionary(job, true);
    // Register the output sink; the job runs when the Flink program is executed
    data.output(hadoopOutputFormat);
}
Example 6
private static void createLineitems(ExecutionEnvironment env) throws IOException {
    DataSet<Tuple2<Void, LineitemTable>> lineitems = getLineitemDataSet(env).map(new LineitemToParquet());
    Job job = Job.getInstance();
    HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ParquetThriftOutputFormat(), job);
    ParquetThriftOutputFormat.setOutputPath(job, new Path(outputPath + "/lineitems"));
    ParquetThriftOutputFormat.setThriftClass(job, LineitemTable.class);
    ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ParquetThriftOutputFormat.setCompressOutput(job, true);
    ParquetThriftOutputFormat.setEnableDictionary(job, true);
    lineitems.output(hadoopOutputFormat);
}
Example 7
private static void createOrders(ExecutionEnvironment env) throws IOException {
    DataSet<Tuple2<Void, OrderTable>> orders = getOrdersDataSet(env).map(new OrdersToParquet());
    Job job = Job.getInstance();
    HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ParquetThriftOutputFormat(), job);
    ParquetThriftOutputFormat.setOutputPath(job, new Path(outputPath + "/orders"));
    ParquetThriftOutputFormat.setThriftClass(job, OrderTable.class);
    ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ParquetThriftOutputFormat.setCompressOutput(job, true);
    ParquetThriftOutputFormat.setEnableDictionary(job, true);
    orders.output(hadoopOutputFormat);
}
Example 8
private static void createCustomers(ExecutionEnvironment env) throws IOException {
    DataSet<Tuple2<Void, CustomerTable>> customers = getCustomerDataSet(env).map(new CustomerToParquet());
    Job job = Job.getInstance();
    HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ParquetThriftOutputFormat(), job);
    ParquetThriftOutputFormat.setOutputPath(job, new Path(outputPath + "/cust"));
    ParquetThriftOutputFormat.setThriftClass(job, CustomerTable.class);
    ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ParquetThriftOutputFormat.setCompressOutput(job, true);
    ParquetThriftOutputFormat.setEnableDictionary(job, true);
    customers.output(hadoopOutputFormat);
}
Example 9
private static void createDateDim(ExecutionEnvironment env) throws IOException {
    DataSet<Tuple2<Void, DateDimTable>> datedims = getDateDimDataSet(env).map(new DateDimToParquet());
    Job job = Job.getInstance();
    HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ParquetThriftOutputFormat(), job);
    ParquetThriftOutputFormat.setOutputPath(job, new Path(outputPath + "/datedim"));
    ParquetThriftOutputFormat.setThriftClass(job, DateDimTable.class);
    ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ParquetThriftOutputFormat.setCompressOutput(job, true);
    ParquetThriftOutputFormat.setEnableDictionary(job, false);
    datedims.output(hadoopOutputFormat);
}
Example 10
private static void createItem(ExecutionEnvironment env) throws IOException {
    DataSet<Tuple2<Void, ItemTable>> items = getItemDataSet(env).map(new ItemToParquet());
    Job job = Job.getInstance();
    HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ParquetThriftOutputFormat(), job);
    ParquetThriftOutputFormat.setOutputPath(job, new Path(outputPath + "/item"));
    ParquetThriftOutputFormat.setThriftClass(job, ItemTable.class);
    ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ParquetThriftOutputFormat.setCompressOutput(job, true);
    ParquetThriftOutputFormat.setEnableDictionary(job, false);
    items.output(hadoopOutputFormat);
}
Example 11
private static void createStoreSales(ExecutionEnvironment env) throws IOException {
    DataSet<Tuple2<Void, StoreSalesTable>> storeSales = getStoreSalesDataSet(env).map(new StoreSalesToParquet());
    Job job = Job.getInstance();
    HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ParquetThriftOutputFormat(), job);
    ParquetThriftOutputFormat.setOutputPath(job, new Path(outputPath + "/storesales"));
    ParquetThriftOutputFormat.setThriftClass(job, StoreSalesTable.class);
    ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ParquetThriftOutputFormat.setCompressOutput(job, true);
    ParquetThriftOutputFormat.setEnableDictionary(job, false);
    storeSales.output(hadoopOutputFormat);
}
Example 12
/**
 * Write the file.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
    Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.IOFileOpts.values()).build();
    int result = cli.runCmd();
    if (result != 0) {
        return result;
    }
    File inputFile = new File(cli.getArgValueAsString(CliCommonOpts.IOFileOpts.INPUT));
    Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.IOFileOpts.OUTPUT));
    AvroParquetWriter<Stock> writer =
            new AvroParquetWriter<Stock>(outputPath, Stock.SCHEMA$,
                    CompressionCodecName.SNAPPY,
                    ParquetWriter.DEFAULT_BLOCK_SIZE,
                    ParquetWriter.DEFAULT_PAGE_SIZE,
                    true);
    for (Stock stock : AvroStockUtils.fromCsvFile(inputFile)) {
        writer.write(stock);
    }
    writer.close();
    return 0;
}
Example 13
public static ParquetMetadata readFooter(FileSystem fileSystem, Path file)
        throws IOException
{
    FileStatus fileStatus = fileSystem.getFileStatus(file);
    try (FSDataInputStream inputStream = fileSystem.open(file)) {
        // Parquet File Layout:
        //
        // MAGIC
        // variable: Data
        // variable: Metadata
        // 4 bytes: MetadataLength
        // MAGIC
        long length = fileStatus.getLen();
        validateParquet(length >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length, "%s is not a valid Parquet File", file);
        long metadataLengthIndex = length - PARQUET_METADATA_LENGTH - MAGIC.length;
        inputStream.seek(metadataLengthIndex);
        int metadataLength = readIntLittleEndian(inputStream);
        byte[] magic = new byte[MAGIC.length];
        inputStream.readFully(magic);
        validateParquet(Arrays.equals(MAGIC, magic), "Not valid Parquet file: %s expected magic number: %s got: %s", file, Arrays.toString(MAGIC), Arrays.toString(magic));
        long metadataIndex = metadataLengthIndex - metadataLength;
        validateParquet(
                metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex,
                "Corrupted Parquet file: %s metadata index: %s out of range",
                file,
                metadataIndex);
        inputStream.seek(metadataIndex);
        FileMetaData fileMetaData = readFileMetaData(inputStream);
        List<SchemaElement> schema = fileMetaData.getSchema();
        validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);
        MessageType messageType = readParquetSchema(schema);
        List<BlockMetaData> blocks = new ArrayList<>();
        List<RowGroup> rowGroups = fileMetaData.getRow_groups();
        if (rowGroups != null) {
            for (RowGroup rowGroup : rowGroups) {
                BlockMetaData blockMetaData = new BlockMetaData();
                blockMetaData.setRowCount(rowGroup.getNum_rows());
                blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
                List<ColumnChunk> columns = rowGroup.getColumns();
                validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
                String filePath = columns.get(0).getFile_path();
                for (ColumnChunk columnChunk : columns) {
                    validateParquet(
                            (filePath == null && columnChunk.getFile_path() == null)
                                    || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                            "all column chunks of the same row group must be in the same file");
                    ColumnMetaData metaData = columnChunk.meta_data;
                    String[] path = metaData.path_in_schema.toArray(new String[metaData.path_in_schema.size()]);
                    ColumnPath columnPath = ColumnPath.get(path);
                    ColumnChunkMetaData column = ColumnChunkMetaData.get(
                            columnPath,
                            messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName(),
                            CompressionCodecName.fromParquet(metaData.codec),
                            readEncodings(metaData.encodings),
                            readStats(metaData.statistics, messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName()),
                            metaData.data_page_offset,
                            metaData.dictionary_page_offset,
                            metaData.num_values,
                            metaData.total_compressed_size,
                            metaData.total_uncompressed_size);
                    blockMetaData.addColumn(column);
                }
                blockMetaData.setPath(filePath);
                blocks.add(blockMetaData);
            }
        }
        Map<String, String> keyValueMetaData = new HashMap<>();
        List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
        if (keyValueList != null) {
            for (KeyValue keyValue : keyValueList) {
                keyValueMetaData.put(keyValue.key, keyValue.value);
            }
        }
        return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
    }
}
Example 14
/**
 * Build a version-specific {@link ParquetWriter} for the given {@link ParquetWriterConfiguration}.
 *
 * @param writerConfiguration the configuration describing the writer to create
 * @return a {@link ParquetWriterShim} wrapping the version-specific writer
 * @throws IOException if the underlying writer cannot be created
 */
@Override
public ParquetWriterShim getVersionSpecificWriter(ParquetWriterConfiguration writerConfiguration)
        throws IOException {
    CompressionCodecName codecName = CompressionCodecName.fromConf(writerConfiguration.getCodecName());
    ParquetProperties.WriterVersion writerVersion = ParquetProperties.WriterVersion
            .fromString(writerConfiguration.getWriterVersion());
    Configuration conf = new Configuration();
    ParquetWriter versionSpecificWriter = null;
    switch (writerConfiguration.getRecordFormat()) {
        case GROUP: {
            GroupWriteSupport.setSchema((MessageType) this.schema, conf);
            WriteSupport support = new GroupWriteSupport();
            versionSpecificWriter = new ParquetWriter<Group>(
                    writerConfiguration.getAbsoluteStagingFile(),
                    support,
                    codecName,
                    writerConfiguration.getBlockSize(),
                    writerConfiguration.getPageSize(),
                    writerConfiguration.getDictPageSize(),
                    writerConfiguration.isDictionaryEnabled(),
                    writerConfiguration.isValidate(),
                    writerVersion,
                    conf);
            break;
        }
        case AVRO: {
            versionSpecificWriter = new AvroParquetWriter(
                    writerConfiguration.getAbsoluteStagingFile(),
                    (Schema) this.schema,
                    codecName,
                    writerConfiguration.getBlockSize(),
                    writerConfiguration.getPageSize(),
                    writerConfiguration.isDictionaryEnabled(),
                    conf);
            break;
        }
        case PROTOBUF: {
            versionSpecificWriter = new ProtoParquetWriter(
                    writerConfiguration.getAbsoluteStagingFile(),
                    (Class<? extends Message>) this.schema,
                    codecName,
                    writerConfiguration.getBlockSize(),
                    writerConfiguration.getPageSize(),
                    writerConfiguration.isDictionaryEnabled(),
                    writerConfiguration.isValidate());
            break;
        }
        default:
            throw new RuntimeException("Record format not supported");
    }
    ParquetWriter finalVersionSpecificWriter = versionSpecificWriter;
    return new ParquetWriterShim() {
        @Override
        public void write(Object record)
                throws IOException {
            finalVersionSpecificWriter.write(record);
        }

        @Override
        public void close()
                throws IOException {
            finalVersionSpecificWriter.close();
        }
    };
}
Example 15
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        LOG.error("Usage: " + getClass().getName() + " INPUTFILE OUTPUTFILE [compression]");
        return 1;
    }
    String inputFile = args[0];
    String outputFile = args[1];
    String compression = (args.length > 2) ? args[2] : "none";
    Path parquetFilePath = null;
    // Find a file in case a directory was passed
    RemoteIterator<LocatedFileStatus> it = FileSystem.get(getConf()).listFiles(new Path(inputFile), true);
    while (it.hasNext()) {
        FileStatus fs = it.next();
        if (fs.isFile()) {
            parquetFilePath = fs.getPath();
            break;
        }
    }
    if (parquetFilePath == null) {
        LOG.error("No file found for " + inputFile);
        return 1;
    }
    LOG.info("Getting schema from " + parquetFilePath);
    ParquetMetadata readFooter = ParquetFileReader.readFooter(getConf(), parquetFilePath);
    MessageType schema = readFooter.getFileMetaData().getSchema();
    LOG.info(schema);
    GroupWriteSupport.setSchema(schema, getConf());
    Job job = new Job(getConf());
    job.setJarByClass(getClass());
    job.setJobName(getClass().getName());
    job.setMapperClass(ReadRequestMap.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(ExampleInputFormat.class);
    job.setOutputFormatClass(ExampleOutputFormat.class);
    CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
    if (compression.equalsIgnoreCase("snappy")) {
        codec = CompressionCodecName.SNAPPY;
    } else if (compression.equalsIgnoreCase("gzip")) {
        codec = CompressionCodecName.GZIP;
    }
    LOG.info("Output compression: " + codec);
    ExampleOutputFormat.setCompression(job, codec);
    FileInputFormat.setInputPaths(job, new Path(inputFile));
    FileOutputFormat.setOutputPath(job, new Path(outputFile));
    job.waitForCompletion(true);
    return 0;
}