Java source code examples: org.apache.parquet.format.ConvertedType
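ConvertedType is the Thrift-generated enum behind the legacy converted_type field of a SchemaElement in a Parquet footer. It predates the richer LogicalType union and is retained for backward compatibility, so footer readers and writers routinely map between ConvertedType, OriginalType, and LogicalTypeAnnotation. The examples below show how open-source engines (parquet-mr, Apache Drill, and others) perform that mapping.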
Example 1

@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) {
  boolean signed = intLogicalType.isSigned();
  switch (intLogicalType.getBitWidth()) {
    case 8:
      return of(signed ? ConvertedType.INT_8 : ConvertedType.UINT_8);
    case 16:
      return of(signed ? ConvertedType.INT_16 : ConvertedType.UINT_16);
    case 32:
      return of(signed ? ConvertedType.INT_32 : ConvertedType.UINT_32);
    case 64:
      return of(signed ? ConvertedType.INT_64 : ConvertedType.UINT_64);
    default:
      throw new RuntimeException("Unknown original type " + intLogicalType.toOriginalType());
  }
}
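In parquet-mr, visitors like this one are dispatched through LogicalTypeAnnotation.accept(...), which returns an Optional. A minimal usage sketch, reusing the CONVERTED_TYPE_CONVERTER_VISITOR constant shown in Example 17 (the surrounding class is assumed):

// Hypothetical call site: map a 16-bit unsigned int annotation to its legacy enum value.
Optional<ConvertedType> ct = LogicalTypeAnnotation.intType(16, false)
    .accept(CONVERTED_TYPE_CONVERTER_VISITOR); // Optional.of(ConvertedType.UINT_16)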
Example 2

@Test
public void testIncompatibleLogicalAndConvertedTypes() {
  ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
  MessageType schema = Types.buildMessage()
      .required(PrimitiveTypeName.BINARY)
      .as(OriginalType.DECIMAL).precision(9).scale(2)
      .named("aBinary")
      .named("Message");
  MessageType expected = Types.buildMessage()
      .required(PrimitiveTypeName.BINARY)
      .as(LogicalTypeAnnotation.jsonType())
      .named("aBinary")
      .named("Message");
  List<SchemaElement> parquetSchema = parquetMetadataConverter.toParquetSchema(schema);
  // Set the converted type field to a different type to verify that, in case of mismatch, it overrides the logical type
  parquetSchema.get(1).setConverted_type(ConvertedType.JSON);
  MessageType actual = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
  assertEquals(expected, actual);
}
Example 3

static VarLengthValuesColumn<?> getReader(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                                          ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, ValueVector v,
                                          SchemaElement schemaElement) throws ExecutionSetupException {
  ConvertedType convertedType = schemaElement.getConverted_type();
  switch (descriptor.getMaxDefinitionLevel()) {
    case 0:
      if (convertedType == null) {
        return new VarLengthColumnReaders.VarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
        case ENUM:
          return new VarLengthColumnReaders.VarCharColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          if (v instanceof VarDecimalVector) {
            return new VarLengthColumnReaders.VarDecimalColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarDecimalVector) v, schemaElement);
          }
          // falls through to VarBinaryColumn when the vector is not a VarDecimalVector
        default:
          return new VarLengthColumnReaders.VarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
    default:
      if (convertedType == null) {
        return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
        case ENUM:
          return new VarLengthColumnReaders.NullableVarCharColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarCharVector) v, schemaElement);
        case DECIMAL:
          if (v instanceof NullableVarDecimalVector) {
            return new VarLengthColumnReaders.NullableVarDecimalColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarDecimalVector) v, schemaElement);
          }
          // falls through to NullableVarBinaryColumn when the vector is not a NullableVarDecimalVector
        default:
          return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarBinaryVector) v, schemaElement);
      }
  }
}
Example 4

public static TypeProtos.MajorType toMajorType(PrimitiveType.PrimitiveTypeName primitiveTypeName, int length,
                                               TypeProtos.DataMode mode, SchemaElement schemaElement,
                                               OptionManager options) {
  ConvertedType convertedType = schemaElement.getConverted_type();
  MinorType minorType = getMinorType(primitiveTypeName, length, convertedType, options);
  TypeProtos.MajorType.Builder typeBuilder = TypeProtos.MajorType.newBuilder().setMinorType(minorType).setMode(mode);
  if (Types.isDecimalType(minorType)) {
    int precision = schemaElement.getPrecision();
    int scale = schemaElement.getScale();
    typeBuilder.setPrecision(precision).setScale(scale);
  }
  return typeBuilder.build();
}
Example 5

static VarLengthValuesColumn<?> getReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize, ColumnDescriptor descriptor,
                                          ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, ValueVector v,
                                          SchemaElement schemaElement) throws ExecutionSetupException {
  ConvertedType convertedType = schemaElement.getConverted_type();
  switch (descriptor.getMaxDefinitionLevel()) {
    case 0:
      if (convertedType == null) {
        return new VarLengthColumnReaders.VarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
          return new VarLengthColumnReaders.VarCharColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          return new VarLengthColumnReaders.Decimal28Column(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (DecimalVector) v, schemaElement);
        default:
          return new VarLengthColumnReaders.VarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
    default:
      if (convertedType == null) {
        return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
          return new VarLengthColumnReaders.NullableVarCharColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          return new NullableDecimalColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (DecimalVector) v, schemaElement);
        default:
          return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
  }
}
Example 6

@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) {
  switch (timeLogicalType.getUnit()) {
    case MILLIS:
      return of(ConvertedType.TIME_MILLIS);
    case MICROS:
      return of(ConvertedType.TIME_MICROS);
    case NANOS:
      return empty();
    default:
      throw new RuntimeException("Unknown converted type for " + timeLogicalType.toOriginalType());
  }
}
Example 7

@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) {
  switch (timestampLogicalType.getUnit()) {
    case MICROS:
      return of(ConvertedType.TIMESTAMP_MICROS);
    case MILLIS:
      return of(ConvertedType.TIMESTAMP_MILLIS);
    case NANOS:
      return empty();
    default:
      throw new RuntimeException("Unknown converted type for " + timestampLogicalType.toOriginalType());
  }
}
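Note that the NANOS branches in Examples 6 and 7 return empty() rather than throwing: nanosecond times and timestamps exist only in the newer LogicalType representation and have no ConvertedType counterpart, so the converted_type field is simply left unset for them (compare the assertNull checks in Example 30).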
Example 8

@Test
public void testSchemaConverterDecimal() {
  ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
  List<SchemaElement> schemaElements = parquetMetadataConverter.toParquetSchema(
      Types.buildMessage()
          .required(PrimitiveTypeName.BINARY)
          .as(OriginalType.DECIMAL).precision(9).scale(2)
          .named("aBinaryDecimal")
          .optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(4)
          .as(OriginalType.DECIMAL).precision(9).scale(2)
          .named("aFixedDecimal")
          .named("Message")
  );
  List<SchemaElement> expected = Lists.newArrayList(
      new SchemaElement("Message").setNum_children(2),
      new SchemaElement("aBinaryDecimal")
          .setRepetition_type(FieldRepetitionType.REQUIRED)
          .setType(Type.BYTE_ARRAY)
          .setConverted_type(ConvertedType.DECIMAL)
          .setLogicalType(LogicalType.DECIMAL(new DecimalType(2, 9)))
          .setPrecision(9).setScale(2),
      new SchemaElement("aFixedDecimal")
          .setRepetition_type(FieldRepetitionType.OPTIONAL)
          .setType(Type.FIXED_LEN_BYTE_ARRAY)
          .setType_length(4)
          .setConverted_type(ConvertedType.DECIMAL)
          .setLogicalType(LogicalType.DECIMAL(new DecimalType(2, 9)))
          .setPrecision(9).setScale(2)
  );
  Assert.assertEquals(expected, schemaElements);
}
Example 9

/**
 * Detect corrupt date values by looking at the min/max values in the metadata.
 *
 * This should only be used when a file does not have enough metadata to determine if
 * the data was written with an external tool or an older version of Drill
 * ({@link org.apache.drill.exec.store.parquet.ParquetRecordWriter#WRITER_VERSION_PROPERTY} <
 * {@link org.apache.drill.exec.store.parquet.ParquetReaderUtility#DRILL_WRITER_VERSION_STD_DATE_FORMAT})
 *
 * This method only checks the first Row Group, because Drill has only ever written
 * a single Row Group per file.
 *
 * @param footer parquet footer
 * @param columns list of columns schema path
 * @param autoCorrectCorruptDates user setting to allow enabling/disabling of auto-correction
 *                                of corrupt dates. There are some rare cases (storing dates thousands
 *                                of years into the future, with tools other than Drill writing files)
 *                                that would result in the date values being "corrected" into bad values.
 */
public static DateCorruptionStatus checkForCorruptDateValuesInStatistics(ParquetMetadata footer,
                                                                         List<SchemaPath> columns,
                                                                         boolean autoCorrectCorruptDates) {
  // Users can turn off date correction in cases where we are detecting corruption based on date values
  // that are unlikely to appear in common datasets. In this case report that no correction needs to happen
  // during the file read
  if (!autoCorrectCorruptDates) {
    return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
  }
  // Drill-produced files have only ever had a single row group; if this changes in the future it won't matter,
  // as we will know from the Drill version written in the files that the dates are correct
  int rowGroupIndex = 0;
  Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
  findDateColWithStatsLoop : for (SchemaPath schemaPath : columns) {
    List<ColumnDescriptor> parquetColumns = footer.getFileMetaData().getSchema().getColumns();
    for (int i = 0; i < parquetColumns.size(); ++i) {
      ColumnDescriptor column = parquetColumns.get(i);
      // this reader only supports flat data; this is restricted in the ParquetScanBatchCreator
      // creating a NameSegment makes sure we are using the standard code for comparing names,
      // currently it is all case-insensitive
      if (Utilities.isStarQuery(columns)
          || getFullColumnPath(column).equalsIgnoreCase(schemaPath.getUnIndexed().toString())) {
        int colIndex = -1;
        ConvertedType convertedType = schemaElements.get(getFullColumnPath(column)).getConverted_type();
        if (convertedType != null && convertedType.equals(ConvertedType.DATE)) {
          List<ColumnChunkMetaData> colChunkList = footer.getBlocks().get(rowGroupIndex).getColumns();
          for (int j = 0; j < colChunkList.size(); j++) {
            if (colChunkList.get(j).getPath().equals(ColumnPath.get(column.getPath()))) {
              colIndex = j;
              break;
            }
          }
        }
        if (colIndex == -1) {
          // column does not appear in this file, skip it
          continue;
        }
        IntStatistics statistics = (IntStatistics) footer.getBlocks().get(rowGroupIndex).getColumns().get(colIndex).getStatistics();
        return (statistics.hasNonNullValue() && statistics.compareMaxToValue(ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) > 0) ?
            DateCorruptionStatus.META_SHOWS_CORRUPTION : DateCorruptionStatus.META_UNCLEAR_TEST_VALUES;
      }
    }
  }
  return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
}
Example 10

public static NullableColumnReader<?> getNullableColumnReader(ParquetRecordReader parentReader,
                                                              ColumnDescriptor columnDescriptor,
                                                              ColumnChunkMetaData columnChunkMetaData,
                                                              boolean fixedLength,
                                                              ValueVector valueVec,
                                                              SchemaElement schemaElement) throws ExecutionSetupException {
  ConvertedType convertedType = schemaElement.getConverted_type();
  if (!columnChunkMetaData.getEncodings().contains(Encoding.PLAIN_DICTIONARY)) {
    if (columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.INT96) {
      // TODO: check convertedType once parquet supports the TIMESTAMP_NANOS type annotation.
      if (parentReader.getFragmentContext().getOptions().getOption(ExecConstants.PARQUET_READER_INT96_AS_TIMESTAMP).bool_val) {
        return new NullableFixedByteAlignedReaders.NullableFixedBinaryAsTimeStampReader(parentReader, columnDescriptor, columnChunkMetaData, true, (NullableTimeStampVector) valueVec, schemaElement);
      } else {
        return new NullableFixedByteAlignedReaders.NullableFixedBinaryReader(parentReader, columnDescriptor, columnChunkMetaData, true, (NullableVarBinaryVector) valueVec, schemaElement);
      }
    } else if (convertedType == ConvertedType.DECIMAL) {
      // NullableVarDecimalVector allows storing values with different widths,
      // so every time a value is added, the offset vector should be updated.
      // Therefore NullableVarDecimalReader is used here instead of NullableFixedByteAlignedReader.
      return new NullableFixedByteAlignedReaders.NullableVarDecimalReader(parentReader,
          columnDescriptor, columnChunkMetaData, fixedLength, (NullableVarDecimalVector) valueVec, schemaElement);
    } else {
      return new NullableFixedByteAlignedReaders.NullableFixedByteAlignedReader<>(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, valueVec, schemaElement);
    }
  } else {
    switch (columnDescriptor.getType()) {
      case INT32:
        if (convertedType == null) {
          return new NullableFixedByteAlignedReaders.NullableDictionaryIntReader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableIntVector) valueVec, schemaElement);
        }
        switch (convertedType) {
          case DECIMAL:
            return new NullableFixedByteAlignedReaders.NullableDictionaryVarDecimalReader(parentReader,
                columnDescriptor, columnChunkMetaData, fixedLength, (NullableVarDecimalVector) valueVec, schemaElement);
          case TIME_MILLIS:
            return new NullableFixedByteAlignedReaders.NullableDictionaryTimeReader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableTimeVector) valueVec, schemaElement);
          default:
            throw new ExecutionSetupException("Unsupported nullable converted type " + convertedType + " for primitive type INT32");
        }
      case INT64:
        if (convertedType == null) {
          return new NullableFixedByteAlignedReaders.NullableDictionaryBigIntReader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableBigIntVector) valueVec, schemaElement);
        }
        switch (convertedType) {
          case DECIMAL:
            return new NullableFixedByteAlignedReaders.NullableDictionaryVarDecimalReader(parentReader,
                columnDescriptor, columnChunkMetaData, fixedLength, (NullableVarDecimalVector) valueVec, schemaElement);
          case TIMESTAMP_MILLIS:
            return new NullableFixedByteAlignedReaders.NullableDictionaryTimeStampReader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableTimeStampVector) valueVec, schemaElement);
          // DRILL-6670: handle TIMESTAMP_MICROS as INT64 with no logical type
          case TIMESTAMP_MICROS:
            return new NullableFixedByteAlignedReaders.NullableDictionaryBigIntReader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableBigIntVector) valueVec, schemaElement);
          default:
            throw new ExecutionSetupException("Unsupported nullable converted type " + convertedType + " for primitive type INT64");
        }
      case INT96:
        // TODO: check convertedType once parquet supports the TIMESTAMP_NANOS type annotation.
        if (parentReader.getFragmentContext().getOptions().getOption(ExecConstants.PARQUET_READER_INT96_AS_TIMESTAMP).bool_val) {
          return new NullableFixedByteAlignedReaders.NullableFixedBinaryAsTimeStampReader(parentReader, columnDescriptor, columnChunkMetaData, true, (NullableTimeStampVector) valueVec, schemaElement);
        } else {
          return new NullableFixedByteAlignedReaders.NullableFixedBinaryReader(parentReader, columnDescriptor, columnChunkMetaData, true, (NullableVarBinaryVector) valueVec, schemaElement);
        }
      case FLOAT:
        return new NullableFixedByteAlignedReaders.NullableDictionaryFloat4Reader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableFloat4Vector) valueVec, schemaElement);
      case DOUBLE:
        return new NullableFixedByteAlignedReaders.NullableDictionaryFloat8Reader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableFloat8Vector) valueVec, schemaElement);
      default:
        throw new ExecutionSetupException("Unsupported nullable column type " + columnDescriptor.getType().name());
    }
  }
}
Example 11

private static TypeProtos.MinorType getMinorType(PrimitiveType.PrimitiveTypeName primitiveTypeName, int length,
                                                 ConvertedType convertedType, OptionManager options) {
  switch (primitiveTypeName) {
    case BINARY:
      if (convertedType == null) {
        return TypeProtos.MinorType.VARBINARY;
      }
      switch (convertedType) {
        case UTF8:
        case ENUM:
          return TypeProtos.MinorType.VARCHAR;
        case DECIMAL:
          ParquetReaderUtility.checkDecimalTypeEnabled(options);
          return TypeProtos.MinorType.VARDECIMAL;
        default:
          return TypeProtos.MinorType.VARBINARY;
      }
    case INT64:
      if (convertedType == null) {
        return TypeProtos.MinorType.BIGINT;
      }
      switch (convertedType) {
        // DRILL-6670: handle TIMESTAMP_MICROS as INT64 with no logical type
        case INT_64:
        case TIMESTAMP_MICROS:
          return TypeProtos.MinorType.BIGINT;
        case UINT_64:
          return TypeProtos.MinorType.UINT8;
        case DECIMAL:
          ParquetReaderUtility.checkDecimalTypeEnabled(options);
          return TypeProtos.MinorType.VARDECIMAL;
        case TIMESTAMP_MILLIS:
          return TypeProtos.MinorType.TIMESTAMP;
        default:
          throw new UnsupportedOperationException(String.format("unsupported type: %s %s", primitiveTypeName, convertedType));
      }
    case INT32:
      if (convertedType == null) {
        return TypeProtos.MinorType.INT;
      }
      switch (convertedType) {
        case UINT_8:
        case UINT_16:
        case UINT_32:
          return TypeProtos.MinorType.UINT4;
        case INT_8:
        case INT_16:
        case INT_32:
          return TypeProtos.MinorType.INT;
        case DECIMAL:
          ParquetReaderUtility.checkDecimalTypeEnabled(options);
          return TypeProtos.MinorType.VARDECIMAL;
        case DATE:
          return TypeProtos.MinorType.DATE;
        case TIME_MILLIS:
          return TypeProtos.MinorType.TIME;
        default:
          throw new UnsupportedOperationException(String.format("unsupported type: %s %s", primitiveTypeName, convertedType));
      }
    case BOOLEAN:
      return TypeProtos.MinorType.BIT;
    case FLOAT:
      return TypeProtos.MinorType.FLOAT4;
    case DOUBLE:
      return TypeProtos.MinorType.FLOAT8;
    // TODO - Both of these are not supported by the parquet library yet (7/3/13),
    // but they are declared here for when they are implemented
    case INT96:
      if (options.getOption(ExecConstants.PARQUET_READER_INT96_AS_TIMESTAMP).bool_val) {
        return TypeProtos.MinorType.TIMESTAMP;
      } else {
        return TypeProtos.MinorType.VARBINARY;
      }
    case FIXED_LEN_BYTE_ARRAY:
      if (convertedType == null) {
        checkArgument(length > 0, "A length greater than zero must be provided for a FixedBinary type.");
        return TypeProtos.MinorType.VARBINARY;
      } else if (convertedType == ConvertedType.DECIMAL) {
        ParquetReaderUtility.checkDecimalTypeEnabled(options);
        return TypeProtos.MinorType.VARDECIMAL;
      } else if (convertedType == ConvertedType.INTERVAL) {
        return TypeProtos.MinorType.INTERVAL;
      }
      // any other converted type falls through to the unsupported-type error below
    default:
      throw new UnsupportedOperationException("Type not supported: " + primitiveTypeName);
  }
}
Example 12

private static ConvertedType getConvertedType(OriginalType type)
{
  switch (type) {
    case UTF8:
      return ConvertedType.UTF8;
    case MAP:
      return ConvertedType.MAP;
    case MAP_KEY_VALUE:
      return ConvertedType.MAP_KEY_VALUE;
    case LIST:
      return ConvertedType.LIST;
    case ENUM:
      return ConvertedType.ENUM;
    case DECIMAL:
      return ConvertedType.DECIMAL;
    case DATE:
      return ConvertedType.DATE;
    case TIME_MILLIS:
      return ConvertedType.TIME_MILLIS;
    case TIMESTAMP_MILLIS:
      return ConvertedType.TIMESTAMP_MILLIS;
    case INTERVAL:
      return ConvertedType.INTERVAL;
    case INT_8:
      return ConvertedType.INT_8;
    case INT_16:
      return ConvertedType.INT_16;
    case INT_32:
      return ConvertedType.INT_32;
    case INT_64:
      return ConvertedType.INT_64;
    case UINT_8:
      return ConvertedType.UINT_8;
    case UINT_16:
      return ConvertedType.UINT_16;
    case UINT_32:
      return ConvertedType.UINT_32;
    case UINT_64:
      return ConvertedType.UINT_64;
    case JSON:
      return ConvertedType.JSON;
    case BSON:
      return ConvertedType.BSON;
    default:
      throw new RuntimeException("Unknown original type " + type);
  }
}
Example 13

private static OriginalType getOriginalType(ConvertedType type)
{
  switch (type) {
    case UTF8:
      return OriginalType.UTF8;
    case MAP:
      return OriginalType.MAP;
    case MAP_KEY_VALUE:
      return OriginalType.MAP_KEY_VALUE;
    case LIST:
      return OriginalType.LIST;
    case ENUM:
      return OriginalType.ENUM;
    case DECIMAL:
      return OriginalType.DECIMAL;
    case DATE:
      return OriginalType.DATE;
    case TIME_MILLIS:
      return OriginalType.TIME_MILLIS;
    case TIMESTAMP_MILLIS:
      return OriginalType.TIMESTAMP_MILLIS;
    case INTERVAL:
      return OriginalType.INTERVAL;
    case INT_8:
      return OriginalType.INT_8;
    case INT_16:
      return OriginalType.INT_16;
    case INT_32:
      return OriginalType.INT_32;
    case INT_64:
      return OriginalType.INT_64;
    case UINT_8:
      return OriginalType.UINT_8;
    case UINT_16:
      return OriginalType.UINT_16;
    case UINT_32:
      return OriginalType.UINT_32;
    case UINT_64:
      return OriginalType.UINT_64;
    case JSON:
      return OriginalType.JSON;
    case BSON:
      return OriginalType.BSON;
    case TIMESTAMP_MICROS:
      return OriginalType.TIMESTAMP_MICROS;
    case TIME_MICROS:
      return OriginalType.TIME_MICROS;
    default:
      throw new IllegalArgumentException("Unknown converted type " + type);
  }
}
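Examples 12 and 13 are near-inverses, except that getOriginalType also handles TIME_MICROS and TIMESTAMP_MICROS, which getConvertedType above does not map. A minimal round-trip sketch, under the assumption that both private helpers are visible from the same class:

// Round-trip every OriginalType that getConvertedType covers; the two
// MICROS values are skipped because getConvertedType would throw for them.
for (OriginalType original : OriginalType.values()) {
  if (original == OriginalType.TIME_MICROS || original == OriginalType.TIMESTAMP_MICROS) {
    continue;
  }
  assert getOriginalType(getConvertedType(original)) == original;
}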
Example 14

/**
 * Detect corrupt date values by looking at the min/max values in the metadata.
 *
 * This should only be used when a file does not have enough metadata to determine if
 * the data was written with an older version of Drill, or an external tool. Drill
 * versions 1.3 and beyond should have enough metadata to confirm that the data was written
 * by Drill.
 *
 * This method only checks the first Row Group, because Drill has only ever written
 * a single Row Group per file.
 *
 * @param footer parquet footer
 * @param columns list of columns schema path
 * @param autoCorrectCorruptDates user setting to allow enabling/disabling of auto-correction
 *                                of corrupt dates. There are some rare cases (storing dates thousands
 *                                of years into the future, with tools other than Drill writing files)
 *                                that would result in the date values being "corrected" into bad values.
 */
public static DateCorruptionStatus checkForCorruptDateValuesInStatistics(ParquetMetadata footer,
                                                                         List<SchemaPath> columns,
                                                                         boolean autoCorrectCorruptDates) {
  // Users can turn off date correction in cases where we are detecting corruption based on date values
  // that are unlikely to appear in common datasets. In this case report that no correction needs to happen
  // during the file read
  if (!autoCorrectCorruptDates) {
    return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
  }
  // Drill-produced files have only ever had a single row group; if this changes in the future it won't matter,
  // as we will know from the Drill version written in the files that the dates are correct
  int rowGroupIndex = 0;
  Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
  findDateColWithStatsLoop : for (SchemaPath schemaPath : columns) {
    List<ColumnDescriptor> parquetColumns = footer.getFileMetaData().getSchema().getColumns();
    for (int i = 0; i < parquetColumns.size(); ++i) {
      ColumnDescriptor column = parquetColumns.get(i);
      // this reader only supports flat data; this is restricted in the ParquetScanBatchCreator
      // creating a NameSegment makes sure we are using the standard code for comparing names,
      // currently it is all case-insensitive
      if (ColumnUtils.isStarQuery(columns) || new PathSegment.NameSegment(column.getPath()[0]).equals(schemaPath.getRootSegment())) {
        int colIndex = -1;
        ConvertedType convertedType = schemaElements.get(column.getPath()[0]).getConverted_type();
        if (convertedType != null && convertedType.equals(ConvertedType.DATE)) {
          List<ColumnChunkMetaData> colChunkList = footer.getBlocks().get(rowGroupIndex).getColumns();
          for (int j = 0; j < colChunkList.size(); j++) {
            if (colChunkList.get(j).getPath().equals(ColumnPath.get(column.getPath()))) {
              colIndex = j;
              break;
            }
          }
        }
        if (colIndex == -1) {
          // column does not appear in this file, skip it
          continue;
        }
        Statistics statistics = footer.getBlocks().get(rowGroupIndex).getColumns().get(colIndex).getStatistics();
        Integer max = (Integer) statistics.genericGetMax();
        if (statistics.hasNonNullValue()) {
          if (max > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) {
            return DateCorruptionStatus.META_SHOWS_CORRUPTION;
          }
        } else {
          // no statistics, go check the first page
          return DateCorruptionStatus.META_UNCLEAR_TEST_VALUES;
        }
      }
    }
  }
  return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
}
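Example 14 is the older Drill variant of Example 9: it matches columns by the root PathSegment.NameSegment rather than the full column path, and reads the raw Statistics max via genericGetMax() instead of casting to IntStatistics. Both decide between the same three verdicts (META_SHOWS_CORRUPTION, META_UNCLEAR_TEST_VALUES, META_SHOWS_NO_CORRUPTION) in the same way.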
Example 15

public static NullableColumnReader<?> getNullableColumnReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize,
                                                              ColumnDescriptor columnDescriptor,
                                                              ColumnChunkMetaData columnChunkMetaData,
                                                              boolean fixedLength,
                                                              ValueVector valueVec,
                                                              SchemaElement schemaElement) throws ExecutionSetupException {
  ConvertedType convertedType = schemaElement.getConverted_type();
  if (!columnChunkMetaData.getEncodings().contains(Encoding.PLAIN_DICTIONARY)) {
    if (columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.INT96) {
      // TODO: check convertedType once parquet supports the TIMESTAMP_NANOS type annotation.
      if (parentReader.readInt96AsTimeStamp()) {
        return new NullableFixedByteAlignedReaders.NullableFixedBinaryAsTimeStampReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, true, (TimeStampMilliVector) valueVec, schemaElement);
      } else {
        return new NullableFixedByteAlignedReaders.NullableFixedBinaryReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, true, (VarBinaryVector) valueVec, schemaElement);
      }
    } else {
      return new NullableFixedByteAlignedReaders.NullableFixedByteAlignedReader<>(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, valueVec, schemaElement);
    }
  } else {
    switch (columnDescriptor.getType()) {
      case INT32:
        if (convertedType == null) {
          return new NullableFixedByteAlignedReaders.NullableDictionaryIntReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (IntVector) valueVec, schemaElement);
        }
        switch (convertedType) {
          case DECIMAL:
            return new NullableFixedByteAlignedReaders.NullableDictionaryDecimal9Reader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (DecimalVector) valueVec, schemaElement);
          case TIME_MILLIS:
            return new NullableFixedByteAlignedReaders.NullableDictionaryTimeReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (TimeMilliVector) valueVec, schemaElement);
          default:
            throw new ExecutionSetupException("Unsupported nullable converted type " + convertedType + " for primitive type INT32");
        }
      case INT64:
        if (convertedType == null) {
          return new NullableFixedByteAlignedReaders.NullableDictionaryBigIntReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (BigIntVector) valueVec, schemaElement);
        }
        switch (convertedType) {
          case DECIMAL:
            return new NullableFixedByteAlignedReaders.NullableDictionaryDecimal18Reader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (DecimalVector) valueVec, schemaElement);
          case TIMESTAMP_MILLIS:
            return new NullableFixedByteAlignedReaders.NullableDictionaryTimeStampReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (TimeStampMilliVector) valueVec, schemaElement);
          default:
            throw new ExecutionSetupException("Unsupported nullable converted type " + convertedType + " for primitive type INT64");
        }
      case INT96:
        // TODO: check convertedType once parquet supports the TIMESTAMP_NANOS type annotation.
        if (parentReader.readInt96AsTimeStamp()) {
          return new NullableFixedByteAlignedReaders.NullableFixedBinaryAsTimeStampReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, true, (TimeStampMilliVector) valueVec, schemaElement);
        } else {
          return new NullableFixedByteAlignedReaders.NullableFixedBinaryReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, true, (VarBinaryVector) valueVec, schemaElement);
        }
      case FLOAT:
        return new NullableFixedByteAlignedReaders.NullableDictionaryFloat4Reader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (Float4Vector) valueVec, schemaElement);
      case DOUBLE:
        return new NullableFixedByteAlignedReaders.NullableDictionaryFloat8Reader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (Float8Vector) valueVec, schemaElement);
      default:
        throw new ExecutionSetupException("Unsupported nullable column type " + columnDescriptor.getType().name());
    }
  }
}
Example 16

private static TypeProtos.MinorType getMinorType(PrimitiveType.PrimitiveTypeName primitiveTypeName, int length,
                                                 SchemaElement schemaElement, OptionManager options, Field arrowField,
                                                 final boolean readInt96AsTimeStamp) {
  ConvertedType convertedType = schemaElement.getConverted_type();
  switch (primitiveTypeName) {
    case BINARY:
      if (convertedType == null) {
        return TypeProtos.MinorType.VARBINARY;
      }
      switch (convertedType) {
        case UTF8:
          return TypeProtos.MinorType.VARCHAR;
        case DECIMAL:
          ParquetReaderUtility.checkDecimalTypeEnabled(options);
          return getDecimalType(schemaElement);
        default:
          return TypeProtos.MinorType.VARBINARY;
      }
    case INT64:
      if (convertedType == null) {
        return TypeProtos.MinorType.BIGINT;
      }
      switch (convertedType) {
        case DECIMAL:
          ParquetReaderUtility.checkDecimalTypeEnabled(options);
          return TypeProtos.MinorType.DECIMAL;
        // TODO - add this back if it is decided to be added upstream, was removed from our pull request July 2014
        // case TIME_MICROS:
        //   throw new UnsupportedOperationException();
        case TIMESTAMP_MILLIS:
          return TypeProtos.MinorType.TIMESTAMP;
        default:
          throw new UnsupportedOperationException(String.format("unsupported type: %s %s", primitiveTypeName, convertedType));
      }
    case INT32:
      if (convertedType == null) {
        return TypeProtos.MinorType.INT;
      }
      switch (convertedType) {
        case DECIMAL:
          ParquetReaderUtility.checkDecimalTypeEnabled(options);
          return TypeProtos.MinorType.DECIMAL;
        case DATE:
          return TypeProtos.MinorType.DATE;
        case TIME_MILLIS:
          return TypeProtos.MinorType.TIME;
        default:
          throw new UnsupportedOperationException(String.format("unsupported type: %s %s", primitiveTypeName, convertedType));
      }
    case BOOLEAN:
      return TypeProtos.MinorType.BIT;
    case FLOAT:
      return TypeProtos.MinorType.FLOAT4;
    case DOUBLE:
      return TypeProtos.MinorType.FLOAT8;
    // TODO - Both of these are not supported by the parquet library yet (7/3/13),
    // but they are declared here for when they are implemented
    case INT96:
      if (readInt96AsTimeStamp) {
        return TypeProtos.MinorType.TIMESTAMP;
      } else {
        return TypeProtos.MinorType.VARBINARY;
      }
    case FIXED_LEN_BYTE_ARRAY:
      if (convertedType == null) {
        checkArgument(length > 0, "A length greater than zero must be provided for a FixedBinary type.");
        return TypeProtos.MinorType.VARBINARY;
      } else if (convertedType == ConvertedType.DECIMAL) {
        ParquetReaderUtility.checkDecimalTypeEnabled(options);
        return getDecimalType(schemaElement);
      } else if (convertedType == ConvertedType.INTERVAL) {
        if (arrowField != null) {
          if (arrowField.getType().getTypeID() == ArrowTypeID.Interval) {
            switch (((Interval) arrowField.getType()).getUnit()) {
              case DAY_TIME:
                return TypeProtos.MinorType.INTERVALDAY;
              case YEAR_MONTH:
                return TypeProtos.MinorType.INTERVALYEAR;
            }
          }
          throw new IllegalArgumentException("incompatible type " + arrowField);
        }
        // TODO: older versions of Drill generated this
        return TypeProtos.MinorType.VARBINARY;
      }
      // any other converted type falls through to the unsupported-type error below
    default:
      throw new UnsupportedOperationException("Type not supported: " + primitiveTypeName);
  }
}
Example 17

ConvertedType convertToConvertedType(LogicalTypeAnnotation logicalTypeAnnotation) {
  return logicalTypeAnnotation.accept(CONVERTED_TYPE_CONVERTER_VISITOR).orElse(null);
}
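A minimal usage sketch mirroring the assertions in Example 30 (the method is package-private, so this assumes a caller in the same package, such as a test):

ParquetMetadataConverter converter = new ParquetMetadataConverter();
ConvertedType utf8 = converter.convertToConvertedType(LogicalTypeAnnotation.stringType()); // ConvertedType.UTF8
ConvertedType none = converter.convertToConvertedType(LogicalTypeAnnotation.uuidType());   // null: UUID has no legacy equivalent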
Example 18

@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) {
  return of(ConvertedType.UTF8);
}

Example 19

@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) {
  return of(ConvertedType.MAP);
}

Example 20

@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) {
  return of(ConvertedType.LIST);
}

Example 21

@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) {
  return of(ConvertedType.ENUM);
}

Example 22

@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) {
  return of(ConvertedType.DECIMAL);
}

Example 23

@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) {
  return of(ConvertedType.DATE);
}

Example 24

@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) {
  return of(ConvertedType.JSON);
}

Example 25

@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) {
  return of(ConvertedType.BSON);
}

Example 26

@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) {
  return of(ConvertedType.INTERVAL);
}

Example 27

@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.MapKeyValueTypeAnnotation mapKeyValueLogicalType) {
  return of(ConvertedType.MAP_KEY_VALUE);
}
Example 28

LogicalTypeAnnotation getLogicalTypeAnnotation(ConvertedType type, SchemaElement schemaElement) {
  switch (type) {
    case UTF8:
      return LogicalTypeAnnotation.stringType();
    case MAP:
      return LogicalTypeAnnotation.mapType();
    case MAP_KEY_VALUE:
      return LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance();
    case LIST:
      return LogicalTypeAnnotation.listType();
    case ENUM:
      return LogicalTypeAnnotation.enumType();
    case DECIMAL:
      int scale = (schemaElement == null ? 0 : schemaElement.scale);
      int precision = (schemaElement == null ? 0 : schemaElement.precision);
      return LogicalTypeAnnotation.decimalType(scale, precision);
    case DATE:
      return LogicalTypeAnnotation.dateType();
    case TIME_MILLIS:
      return LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MILLIS);
    case TIME_MICROS:
      return LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MICROS);
    case TIMESTAMP_MILLIS:
      return LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MILLIS);
    case TIMESTAMP_MICROS:
      return LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS);
    case INTERVAL:
      return LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance();
    case INT_8:
      return LogicalTypeAnnotation.intType(8, true);
    case INT_16:
      return LogicalTypeAnnotation.intType(16, true);
    case INT_32:
      return LogicalTypeAnnotation.intType(32, true);
    case INT_64:
      return LogicalTypeAnnotation.intType(64, true);
    case UINT_8:
      return LogicalTypeAnnotation.intType(8, false);
    case UINT_16:
      return LogicalTypeAnnotation.intType(16, false);
    case UINT_32:
      return LogicalTypeAnnotation.intType(32, false);
    case UINT_64:
      return LogicalTypeAnnotation.intType(64, false);
    case JSON:
      return LogicalTypeAnnotation.jsonType();
    case BSON:
      return LogicalTypeAnnotation.bsonType();
    default:
      throw new RuntimeException("Can't convert converted type to logical type, unknown converted type " + type);
  }
}
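One detail worth flagging in the DECIMAL branch above: LogicalTypeAnnotation.decimalType(scale, precision) takes scale first, matching the order in which the SchemaElement fields are read here, unlike the SQL-style DECIMAL(precision, scale) order; swapping the two is an easy mistake when building decimal annotations by hand.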
Example 29

private void buildChildren(Types.GroupBuilder builder,
                           Iterator<SchemaElement> schema,
                           int childrenCount,
                           List<ColumnOrder> columnOrders,
                           int columnCount) {
  for (int i = 0; i < childrenCount; i++) {
    SchemaElement schemaElement = schema.next();
    // Create Parquet Type.
    Types.Builder childBuilder;
    if (schemaElement.type != null) {
      Types.PrimitiveBuilder primitiveBuilder = builder.primitive(
          getPrimitive(schemaElement.type),
          fromParquetRepetition(schemaElement.repetition_type));
      if (schemaElement.isSetType_length()) {
        primitiveBuilder.length(schemaElement.type_length);
      }
      if (schemaElement.isSetPrecision()) {
        primitiveBuilder.precision(schemaElement.precision);
      }
      if (schemaElement.isSetScale()) {
        primitiveBuilder.scale(schemaElement.scale);
      }
      if (columnOrders != null) {
        org.apache.parquet.schema.ColumnOrder columnOrder = fromParquetColumnOrder(columnOrders.get(columnCount));
        // As per parquet format 2.4.0 no UNDEFINED order is supported. So, set undefined column order for the types
        // where ordering is not supported.
        if (columnOrder.getColumnOrderName() == ColumnOrderName.TYPE_DEFINED_ORDER
            && (schemaElement.type == Type.INT96 || schemaElement.converted_type == ConvertedType.INTERVAL)) {
          columnOrder = org.apache.parquet.schema.ColumnOrder.undefined();
        }
        primitiveBuilder.columnOrder(columnOrder);
      }
      childBuilder = primitiveBuilder;
    } else {
      childBuilder = builder.group(fromParquetRepetition(schemaElement.repetition_type));
      buildChildren((Types.GroupBuilder) childBuilder, schema, schemaElement.num_children, columnOrders, columnCount);
    }
    if (schemaElement.isSetLogicalType()) {
      childBuilder.as(getLogicalTypeAnnotation(schemaElement.logicalType));
    }
    if (schemaElement.isSetConverted_type()) {
      OriginalType originalType = getLogicalTypeAnnotation(schemaElement.converted_type, schemaElement).toOriginalType();
      OriginalType newOriginalType = (schemaElement.isSetLogicalType() && getLogicalTypeAnnotation(schemaElement.logicalType) != null) ?
          getLogicalTypeAnnotation(schemaElement.logicalType).toOriginalType() : null;
      if (!originalType.equals(newOriginalType)) {
        if (newOriginalType != null) {
          LOG.warn("Converted type and logical type metadata mismatch (convertedType: {}, logical type: {}). Using value in converted type.",
              schemaElement.converted_type, schemaElement.logicalType);
        }
        childBuilder.as(originalType);
      }
    }
    if (schemaElement.isSetField_id()) {
      childBuilder.id(schemaElement.field_id);
    }
    childBuilder.named(schemaElement.name);
    ++columnCount;
  }
}
Example 30

@Test
public void testLogicalToConvertedTypeConversion() {
  ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
  assertEquals(ConvertedType.UTF8, parquetMetadataConverter.convertToConvertedType(stringType()));
  assertEquals(ConvertedType.ENUM, parquetMetadataConverter.convertToConvertedType(enumType()));
  assertEquals(ConvertedType.INT_8, parquetMetadataConverter.convertToConvertedType(intType(8, true)));
  assertEquals(ConvertedType.INT_16, parquetMetadataConverter.convertToConvertedType(intType(16, true)));
  assertEquals(ConvertedType.INT_32, parquetMetadataConverter.convertToConvertedType(intType(32, true)));
  assertEquals(ConvertedType.INT_64, parquetMetadataConverter.convertToConvertedType(intType(64, true)));
  assertEquals(ConvertedType.UINT_8, parquetMetadataConverter.convertToConvertedType(intType(8, false)));
  assertEquals(ConvertedType.UINT_16, parquetMetadataConverter.convertToConvertedType(intType(16, false)));
  assertEquals(ConvertedType.UINT_32, parquetMetadataConverter.convertToConvertedType(intType(32, false)));
  assertEquals(ConvertedType.UINT_64, parquetMetadataConverter.convertToConvertedType(intType(64, false)));
  assertEquals(ConvertedType.DECIMAL, parquetMetadataConverter.convertToConvertedType(decimalType(8, 16)));
  assertEquals(ConvertedType.TIMESTAMP_MILLIS, parquetMetadataConverter.convertToConvertedType(timestampType(true, MILLIS)));
  assertEquals(ConvertedType.TIMESTAMP_MICROS, parquetMetadataConverter.convertToConvertedType(timestampType(true, MICROS)));
  assertNull(parquetMetadataConverter.convertToConvertedType(timestampType(true, NANOS)));
  assertEquals(ConvertedType.TIMESTAMP_MILLIS, parquetMetadataConverter.convertToConvertedType(timestampType(false, MILLIS)));
  assertEquals(ConvertedType.TIMESTAMP_MICROS, parquetMetadataConverter.convertToConvertedType(timestampType(false, MICROS)));
  assertNull(parquetMetadataConverter.convertToConvertedType(timestampType(false, NANOS)));
  assertEquals(ConvertedType.TIME_MILLIS, parquetMetadataConverter.convertToConvertedType(timeType(true, MILLIS)));
  assertEquals(ConvertedType.TIME_MICROS, parquetMetadataConverter.convertToConvertedType(timeType(true, MICROS)));
  assertNull(parquetMetadataConverter.convertToConvertedType(timeType(true, NANOS)));
  assertEquals(ConvertedType.TIME_MILLIS, parquetMetadataConverter.convertToConvertedType(timeType(false, MILLIS)));
  assertEquals(ConvertedType.TIME_MICROS, parquetMetadataConverter.convertToConvertedType(timeType(false, MICROS)));
  assertNull(parquetMetadataConverter.convertToConvertedType(timeType(false, NANOS)));
  assertEquals(ConvertedType.DATE, parquetMetadataConverter.convertToConvertedType(dateType()));
  assertEquals(ConvertedType.INTERVAL, parquetMetadataConverter.convertToConvertedType(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance()));
  assertEquals(ConvertedType.JSON, parquetMetadataConverter.convertToConvertedType(jsonType()));
  assertEquals(ConvertedType.BSON, parquetMetadataConverter.convertToConvertedType(bsonType()));
  assertNull(parquetMetadataConverter.convertToConvertedType(uuidType()));
  assertEquals(ConvertedType.LIST, parquetMetadataConverter.convertToConvertedType(listType()));
  assertEquals(ConvertedType.MAP, parquetMetadataConverter.convertToConvertedType(mapType()));
  assertEquals(ConvertedType.MAP_KEY_VALUE, parquetMetadataConverter.convertToConvertedType(LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance()));
}