Java Code Examples: org.apache.parquet.format.SchemaElement
Example 1
/**
 * Map full schema paths in format `a`.`b`.`c` to respective SchemaElement objects.
 *
 * @param footer Parquet file metadata
 * @return schema full path to SchemaElement map
 */
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  Map<String, SchemaElement> schemaElements = new HashMap<>();
  FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
  Iterator<SchemaElement> iter = fileMetaData.getSchema().iterator();
  // The first element in the collection is the default `root` element. We skip it so that keys take
  // the form `a` rather than `root`.`a`, which avoids stripping the root prefix again when comparing
  // against the SchemaPath string representation.
  if (iter.hasNext()) {
    iter.next();
  }
  while (iter.hasNext()) {
    addSchemaElementMapping(iter, new StringBuilder(), schemaElements);
  }
  return schemaElements;
}
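A minimal usage sketch for this mapping. It assumes the method above lives in Drill's ParquetReaderUtility, and the file path is hypothetical; ParquetFileReader.readFooter is deprecated in newer parquet-hadoop releases but still available:

import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.SchemaElement;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

// Read the footer of a (hypothetical) Parquet file and build the path-to-element map.
ParquetMetadata footer = ParquetFileReader.readFooter(new Configuration(), new Path("/tmp/example.parquet"));
Map<String, SchemaElement> byPath = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
// Keys are lower-cased, backquoted full paths, e.g. "`a`.`b`".
SchemaElement element = byPath.get("`a`.`b`");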
Example 2
/**
 * Populate the full path to SchemaElement map by recursively traversing the schema elements
 * referenced by the given iterator.
 *
 * @param iter file schema values iterator
 * @param path parent schema element path
 * @param schemaElements schema elements map to insert the next iterator element into
 */
private static void addSchemaElementMapping(Iterator<SchemaElement> iter, StringBuilder path,
    Map<String, SchemaElement> schemaElements) {
  SchemaElement schemaElement = iter.next();
  path.append('`').append(schemaElement.getName().toLowerCase()).append('`');
  schemaElements.put(path.toString(), schemaElement);
  // For each element that has children, track the remaining child count so we can exit
  // the current recursion level once no more children are left.
  int remainingChildren = schemaElement.getNum_children();
  while (remainingChildren > 0 && iter.hasNext()) {
    addSchemaElementMapping(iter, new StringBuilder(path).append('.'), schemaElements);
    remainingChildren--;
  }
}
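To make the traversal concrete, here is a sketch (with hypothetical field names) of the map this recursion produces for a small nested schema:

// For a file schema stored as the depth-first Thrift list
//   root (num_children=2), a (num_children=1), b, c
// the resulting map contains:
//   "`a`"     -> element a
//   "`a`.`b`" -> element b
//   "`c`"     -> element c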
Example 3
@Test
public void testIncompatibleLogicalAndConvertedTypes() {
  ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
  MessageType schema = Types.buildMessage()
      .required(PrimitiveTypeName.BINARY)
      .as(OriginalType.DECIMAL).precision(9).scale(2)
      .named("aBinary")
      .named("Message");
  MessageType expected = Types.buildMessage()
      .required(PrimitiveTypeName.BINARY)
      .as(LogicalTypeAnnotation.jsonType())
      .named("aBinary")
      .named("Message");
  List<SchemaElement> parquetSchema = parquetMetadataConverter.toParquetSchema(schema);
  // Set the converted type field to a different type to verify that, in case of a mismatch,
  // it overrides the logical type.
  parquetSchema.get(1).setConverted_type(ConvertedType.JSON);
  MessageType actual = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
  assertEquals(expected, actual);
}
Example 4
private FileMetaData metadata(long... sizes) {
  List<SchemaElement> schema = emptyList();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long offset = 0;
  for (long size : sizes) {
    ColumnChunk columnChunk = new ColumnChunk(offset);
    columnChunk.setMeta_data(new ColumnMetaData(
        INT32,
        Collections.<org.apache.parquet.format.Encoding>emptyList(),
        Collections.<String>emptyList(),
        UNCOMPRESSED, 10L, size * 2, size, offset));
    rowGroups.add(new RowGroup(Arrays.asList(columnChunk), size, 1));
    offset += size;
  }
  return new FileMetaData(1, schema, sizes.length, rowGroups);
}
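A hedged usage note: calling this test helper with a few sizes yields one single-column row group per size, with cumulative chunk offsets. The call below is illustrative:

// Three row groups of 100, 200 and 300 bytes; chunk offsets are 0, 100 and 300.
FileMetaData md = metadata(100, 200, 300);
assert md.getRow_groups().size() == 3;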
Example 5
VarLengthValuesColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
    ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, V v,
    SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  variableWidthVector = (VariableWidthVector) valueVec;
  if (columnChunkMetaData.getEncodings().contains(Encoding.PLAIN_DICTIONARY)) {
    usingDictionary = true;
    // The fixed-length optimization is not implemented when a Parquet dictionary is used, as there
    // are no data points about this use case. Bulk processing is also enabled by default, since the
    // early data profiling (used to detect the best processing strategy) is disabled when the column
    // precision is already set.
    bulkReaderState.columnPrecInfo.columnPrecisionType = ColumnPrecisionType.DT_PRECISION_IS_VARIABLE;
    bulkReaderState.columnPrecInfo.bulkProcess = true;
  } else {
    usingDictionary = false;
  }
}
Example 6
protected ColumnReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize, ColumnDescriptor descriptor,
    ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, V v, SchemaElement schemaElement) throws ExecutionSetupException {
  this.parentReader = parentReader;
  this.columnDescriptor = descriptor;
  this.columnChunkMetaData = columnChunkMetaData;
  this.isFixedLength = fixedLength;
  this.schemaElement = schemaElement;
  this.valueVec = v;
  this.pageReader = (parentReader.getSingleStream() != null)
      ? new DeprecatedSingleStreamPageReader(this, parentReader.getSingleStream(), parentReader.getFsPath(), columnChunkMetaData)
      : new PageReader(this, parentReader.getFileSystem(), parentReader.getFsPath(), columnChunkMetaData);
  if (columnDescriptor.getType() != PrimitiveType.PrimitiveTypeName.BINARY) {
    if (columnDescriptor.getType() == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
      dataTypeLengthInBits = columnDescriptor.getTypeLength() * 8;
    } else if (columnDescriptor.getType() == PrimitiveTypeName.INT96
        && valueVec instanceof TimeStampMilliVector) {
      // If an INT96 column is being read as a timestamp, this truncates the time format used by Impala.
      // dataTypeLengthInBits is only ever used when computing offsets into the destination vector, so it
      // needs to be set to the bit width of the resulting Arrow type; usually this matches the input length.
      dataTypeLengthInBits = 64;
    } else {
      dataTypeLengthInBits = DeprecatedParquetVectorizedReader.getTypeLengthInBits(columnDescriptor.getType());
    }
  }
}
Example 7
VarDecimalColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
    ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, VarDecimalVector v,
    SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  this.varDecimalVector = v;
  this.mutator = v.getMutator();
}
Example 8
NullableVarDecimalColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
    ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, NullableVarDecimalVector v,
    SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  nullableVarDecimalVector = v;
  this.mutator = v.getMutator();
}
Example 9
VarCharColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
    ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, VarCharVector v,
    SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  this.varCharVector = v;
  this.mutator = v.getMutator();
}
Example 10
VarBinaryColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
    ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, VarBinaryVector v,
    SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  this.varBinaryVector = v;
  this.mutator = v.getMutator();
}
Example 11
NullableVarBinaryColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
    ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, NullableVarBinaryVector v,
    SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  this.nullableVarBinaryVector = v;
  this.mutator = v.getMutator();
}
Example 12
VarLengthColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
    ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, V v,
    SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  if (columnChunkMetaData.getEncodings().contains(Encoding.PLAIN_DICTIONARY)) {
    usingDictionary = true;
  } else {
    usingDictionary = false;
  }
}
Example 13
CorruptionDetectingNullableDateReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize,
    ColumnDescriptor descriptor, ColumnChunkMetaData columnChunkMetaData,
    boolean fixedLength, DateMilliVector v, SchemaElement schemaElement)
    throws ExecutionSetupException {
  super(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  dateVector = v;
}
Example 14
@Test
public void testLogicalTypesBackwardCompatibleWithConvertedTypes() {
  ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
  MessageType expected = Types.buildMessage()
      .required(PrimitiveTypeName.BINARY)
      .as(OriginalType.DECIMAL).precision(9).scale(2)
      .named("aBinaryDecimal")
      .named("Message");
  List<SchemaElement> parquetSchema = parquetMetadataConverter.toParquetSchema(expected);
  // Set the logical type field to null to test backward compatibility with files written by an older
  // API, where converted types are written to the metadata but the logical type is missing.
  parquetSchema.get(1).setLogicalType(null);
  MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
  assertEquals(expected, schema);
}
Example 15
public static TypeProtos.MajorType toMajorType(PrimitiveType.PrimitiveTypeName primitiveTypeName, int length,
    TypeProtos.DataMode mode, SchemaElement schemaElement,
    OptionManager options, Field arrowField, final boolean readInt96AsTimeStamp) {
  MinorType minorType = getMinorType(primitiveTypeName, length, schemaElement, options, arrowField, readInt96AsTimeStamp);
  TypeProtos.MajorType.Builder typeBuilder = TypeProtos.MajorType.newBuilder().setMinorType(minorType).setMode(mode);
  if (CoreDecimalUtility.isDecimalType(minorType)) {
    typeBuilder.setPrecision(schemaElement.getPrecision()).setScale(schemaElement.getScale());
  }
  return typeBuilder.build();
}
Example 16
public void resolveDrillType(Map<String, SchemaElement> schemaElements, OptionManager options) {
  se = schemaElements.get(ParquetReaderUtility.getFullColumnPath(column));
  type = ParquetToDrillTypeConverter.toMajorType(column.getType(), column.getTypeLength(),
      getDataMode(column), se, options);
  field = MaterializedField.create(toFieldName(column.getPath()).getLastSegment().getNameSegment().getPath(), type);
  length = getDataTypeLength();
}
Example 17
static VarLengthValuesColumn<?> getReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize, ColumnDescriptor descriptor,
    ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, ValueVector v,
    SchemaElement schemaElement) throws ExecutionSetupException {
  ConvertedType convertedType = schemaElement.getConverted_type();
  // A max definition level of 0 means the column is required (non-nullable);
  // anything greater calls for a nullable reader.
  switch (descriptor.getMaxDefinitionLevel()) {
    case 0:
      if (convertedType == null) {
        return new VarLengthColumnReaders.VarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
          return new VarLengthColumnReaders.VarCharColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          return new VarLengthColumnReaders.Decimal28Column(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (DecimalVector) v, schemaElement);
        default:
          return new VarLengthColumnReaders.VarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
    default:
      if (convertedType == null) {
        return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
          return new VarLengthColumnReaders.NullableVarCharColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          return new NullableDecimalColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (DecimalVector) v, schemaElement);
        default:
          return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
  }
}
Example 18
static VarLengthValuesColumn<?> getReader(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
    ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, ValueVector v,
    SchemaElement schemaElement) throws ExecutionSetupException {
  ConvertedType convertedType = schemaElement.getConverted_type();
  switch (descriptor.getMaxDefinitionLevel()) {
    case 0:
      if (convertedType == null) {
        return new VarLengthColumnReaders.VarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
        case ENUM:
          return new VarLengthColumnReaders.VarCharColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          if (v instanceof VarDecimalVector) {
            return new VarLengthColumnReaders.VarDecimalColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarDecimalVector) v, schemaElement);
          }
          // intentional fall-through: a DECIMAL column backed by a non-VarDecimal vector is read as binary
        default:
          return new VarLengthColumnReaders.VarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
    default:
      if (convertedType == null) {
        return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
        case ENUM:
          return new VarLengthColumnReaders.NullableVarCharColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarCharVector) v, schemaElement);
        case DECIMAL:
          if (v instanceof NullableVarDecimalVector) {
            return new VarLengthColumnReaders.NullableVarDecimalColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarDecimalVector) v, schemaElement);
          }
          // intentional fall-through: a DECIMAL column backed by a non-VarDecimal vector is read as binary
        default:
          return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarBinaryVector) v, schemaElement);
      }
  }
}
Example 19
NullableColumnReader(ParquetRecordReader parentReader, ColumnDescriptor descriptor, ColumnChunkMetaData columnChunkMetaData,
    boolean fixedLength, V v, SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  castedBaseVector = (BaseDataValueVector) v;
  castedVectorMutator = (NullableVectorDefinitionSetter) v.getMutator();
}
Example 20
public static TypeProtos.MajorType toMajorType(PrimitiveType.PrimitiveTypeName primitiveTypeName, int length,
    TypeProtos.DataMode mode, SchemaElement schemaElement,
    OptionManager options) {
  ConvertedType convertedType = schemaElement.getConverted_type();
  MinorType minorType = getMinorType(primitiveTypeName, length, convertedType, options);
  TypeProtos.MajorType.Builder typeBuilder = TypeProtos.MajorType.newBuilder().setMinorType(minorType).setMode(mode);
  if (Types.isDecimalType(minorType)) {
    int precision = schemaElement.getPrecision();
    int scale = schemaElement.getScale();
    typeBuilder.setPrecision(precision).setScale(scale);
  }
  return typeBuilder.build();
}
Example 21
private static MessageType readParquetSchema(List<SchemaElement> schema)
{
  Iterator<SchemaElement> schemaIterator = schema.iterator();
  SchemaElement rootSchema = schemaIterator.next();
  Types.MessageTypeBuilder builder = Types.buildMessage();
  readTypeSchema(builder, schemaIterator, rootSchema.getNum_children());
  return builder.named(rootSchema.name);
}
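The Thrift footer stores the schema as a depth-first flattened list whose first element is the root, carrying only a name and a child count. A hand-built sketch of such a list, using the fluent setters shown in Example 24 (the field name "id" is hypothetical):

List<SchemaElement> schema = Arrays.asList(
    new SchemaElement("Message").setNum_children(1),
    new SchemaElement("id")
        .setType(Type.INT32)
        .setRepetition_type(FieldRepetitionType.REQUIRED));
// Equivalent to: message Message { required int32 id; }
MessageType messageType = readParquetSchema(schema);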
Example 22
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  HashMap<String, SchemaElement> schemaElements = new HashMap<>();
  FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
  for (SchemaElement se : fileMetaData.getSchema()) {
    schemaElements.put(se.getName(), se);
  }
  return schemaElements;
}
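Note the contrast with Example 1: this older variant keys the map by bare element name rather than by full path, so nested fields that share a name collide. A hypothetical illustration:

// For schema root { a { x }, b { x } } this variant maps both leaves to the single
// key "x" (one entry overwrites the other), while the Example 1 variant keeps the
// distinct keys "`a`.`x`" and "`b`.`x`".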
Example 23
VarLengthColumn(DeprecatedParquetVectorizedReader parentReader, int allocateSize, ColumnDescriptor descriptor,
    ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, V v,
    SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  if (columnChunkMetaData.getEncodings().contains(Encoding.PLAIN_DICTIONARY)) {
    usingDictionary = true;
  } else {
    usingDictionary = false;
  }
}
Example 24
@Test
public void testSchemaConverterDecimal() {
  ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
  List<SchemaElement> schemaElements = parquetMetadataConverter.toParquetSchema(
      Types.buildMessage()
          .required(PrimitiveTypeName.BINARY)
          .as(OriginalType.DECIMAL).precision(9).scale(2)
          .named("aBinaryDecimal")
          .optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(4)
          .as(OriginalType.DECIMAL).precision(9).scale(2)
          .named("aFixedDecimal")
          .named("Message")
  );
  List<SchemaElement> expected = Lists.newArrayList(
      new SchemaElement("Message").setNum_children(2),
      new SchemaElement("aBinaryDecimal")
          .setRepetition_type(FieldRepetitionType.REQUIRED)
          .setType(Type.BYTE_ARRAY)
          .setConverted_type(ConvertedType.DECIMAL)
          .setLogicalType(LogicalType.DECIMAL(new DecimalType(2, 9)))
          .setPrecision(9).setScale(2),
      new SchemaElement("aFixedDecimal")
          .setRepetition_type(FieldRepetitionType.OPTIONAL)
          .setType(Type.FIXED_LEN_BYTE_ARRAY)
          .setType_length(4)
          .setConverted_type(ConvertedType.DECIMAL)
          .setLogicalType(LogicalType.DECIMAL(new DecimalType(2, 9)))
          .setPrecision(9).setScale(2)
  );
  Assert.assertEquals(expected, schemaElements);
}
Example 25
/**
 * Detect corrupt date values by looking at the min/max values in the metadata.
 *
 * This should only be used when a file does not have enough metadata to determine if
 * the data was written with an external tool or an older version of Drill
 * ({@link org.apache.drill.exec.store.parquet.ParquetRecordWriter#WRITER_VERSION_PROPERTY} <
 * {@link org.apache.drill.exec.store.parquet.ParquetReaderUtility#DRILL_WRITER_VERSION_STD_DATE_FORMAT})
 *
 * This method only checks the first row group, because Drill has only ever written
 * a single row group per file.
 *
 * @param footer parquet footer
 * @param columns list of column schema paths
 * @param autoCorrectCorruptDates user setting to allow enabling/disabling of auto-correction
 *                                of corrupt dates. There are some rare cases (storing dates thousands
 *                                of years into the future, with tools other than Drill writing files)
 *                                that would result in the date values being "corrected" into bad values.
 */
public static DateCorruptionStatus checkForCorruptDateValuesInStatistics(ParquetMetadata footer,
    List<SchemaPath> columns,
    boolean autoCorrectCorruptDates) {
  // Users can turn off date correction when corruption is being detected based on date values
  // that are unlikely to appear in common datasets. In that case report that no correction
  // needs to happen during the file read.
  if (!autoCorrectCorruptDates) {
    return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
  }
  // Drill-produced files have only ever had a single row group. If this changes in the future it
  // won't matter, as the Drill version written into such files will tell us the dates are correct.
  int rowGroupIndex = 0;
  Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
  findDateColWithStatsLoop: for (SchemaPath schemaPath : columns) {
    List<ColumnDescriptor> parquetColumns = footer.getFileMetaData().getSchema().getColumns();
    for (int i = 0; i < parquetColumns.size(); ++i) {
      ColumnDescriptor column = parquetColumns.get(i);
      // This reader only supports flat data; this is enforced in the ParquetScanBatchCreator.
      // Creating a NameSegment ensures we use the standard code for comparing names;
      // currently it is all case-insensitive.
      if (Utilities.isStarQuery(columns)
          || getFullColumnPath(column).equalsIgnoreCase(schemaPath.getUnIndexed().toString())) {
        int colIndex = -1;
        ConvertedType convertedType = schemaElements.get(getFullColumnPath(column)).getConverted_type();
        if (convertedType != null && convertedType.equals(ConvertedType.DATE)) {
          List<ColumnChunkMetaData> colChunkList = footer.getBlocks().get(rowGroupIndex).getColumns();
          for (int j = 0; j < colChunkList.size(); j++) {
            if (colChunkList.get(j).getPath().equals(ColumnPath.get(column.getPath()))) {
              colIndex = j;
              break;
            }
          }
        }
        if (colIndex == -1) {
          // column does not appear in this file, skip it
          continue;
        }
        IntStatistics statistics = (IntStatistics) footer.getBlocks().get(rowGroupIndex).getColumns().get(colIndex).getStatistics();
        return (statistics.hasNonNullValue() && statistics.compareMaxToValue(ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) > 0) ?
            DateCorruptionStatus.META_SHOWS_CORRUPTION : DateCorruptionStatus.META_UNCLEAR_TEST_VALUES;
      }
    }
  }
  return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
}
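A hedged usage sketch; the footer and the column list are assumed to come from the surrounding reader setup:

// Hypothetical call with auto-correction enabled.
DateCorruptionStatus status =
    ParquetReaderUtility.checkForCorruptDateValuesInStatistics(footer, columns, true);
if (status == DateCorruptionStatus.META_SHOWS_CORRUPTION) {
  // the reader should shift date values read from this file by the known corruption offset
}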
Example 26
NullableDateReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize, ColumnDescriptor descriptor, ColumnChunkMetaData columnChunkMetaData,
    boolean fixedLength, DateMilliVector v, SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
}
Example 27
BitReader(ParquetRecordReader parentReader, ColumnDescriptor descriptor, ColumnChunkMetaData columnChunkMetaData,
    boolean fixedLength, BitVector v, SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
}
Example 28
CorruptDateReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize, ColumnDescriptor descriptor, ColumnChunkMetaData columnChunkMetaData,
    boolean fixedLength, DateMilliVector v, SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  vector = v;
}
Example 29
NullableFixedByteAlignedReader(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
    ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, V v, SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
}
Example 30
NullableDictionaryIntReader(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
    ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, NullableIntVector v,
    SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
}