Java Source Code Examples: org.apache.parquet.format.ConvertedType
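
org.apache.parquet.format.ConvertedType is the Thrift-generated enum from the parquet-format specification that carries the legacy type annotation of a SchemaElement (UTF8, DECIMAL, TIMESTAMP_MILLIS, and so on). It predates the newer LogicalType field, and writers typically set both for backward compatibility, so schema converters have to reconcile the two when reading. Before the examples, here is a minimal sketch of the basic access pattern; the class name ConvertedTypeDemo and the column name "aString" are made up for illustration, and only Thrift accessors that also appear in the examples below are used.

import org.apache.parquet.format.ConvertedType;
import org.apache.parquet.format.SchemaElement;

public class ConvertedTypeDemo {
  public static void main(String[] args) {
    // name is the only required Thrift field of SchemaElement
    SchemaElement element = new SchemaElement("aString");
    element.setConverted_type(ConvertedType.UTF8);

    // converted_type is optional: when unset it reads back as null,
    // which is why the readers below always null-check it before switching on it.
    if (element.isSetConverted_type()) {
      System.out.println("converted type: " + element.getConverted_type());
    }
  }
}

Examples 12 and 13 below map this enum to and from parquet-mr's OriginalType, and example 28 converts it to the newer LogicalTypeAnnotation API.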

Example 1
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) {
  boolean signed = intLogicalType.isSigned();
  switch (intLogicalType.getBitWidth()) {
    case 8:
      return of(signed ? ConvertedType.INT_8 : ConvertedType.UINT_8);
    case 16:
      return of(signed ? ConvertedType.INT_16 : ConvertedType.UINT_16);
    case 32:
      return of(signed ? ConvertedType.INT_32 : ConvertedType.UINT_32);
    case 64:
      return of(signed ? ConvertedType.INT_64 : ConvertedType.UINT_64);
    default:
      throw new RuntimeException("Unknown original type " + intLogicalType.toOriginalType());
  }
}
 
Example 2
@Test
public void testIncompatibleLogicalAndConvertedTypes() {
  ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
  MessageType schema = Types.buildMessage()
    .required(PrimitiveTypeName.BINARY)
    .as(OriginalType.DECIMAL).precision(9).scale(2)
    .named("aBinary")
    .named("Message");
  MessageType expected = Types.buildMessage()
    .required(PrimitiveTypeName.BINARY)
    .as(LogicalTypeAnnotation.jsonType())
    .named("aBinary")
    .named("Message");

  List<SchemaElement> parquetSchema = parquetMetadataConverter.toParquetSchema(schema);
  // Set the converted type field to a different type to verify that, in case of a mismatch, it overrides the logical type
  parquetSchema.get(1).setConverted_type(ConvertedType.JSON);
  MessageType actual = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
  assertEquals(expected, actual);
}
 
Example 3
static VarLengthValuesColumn<?> getReader(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                                        ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, ValueVector v,
                                        SchemaElement schemaElement
) throws ExecutionSetupException {
  ConvertedType convertedType = schemaElement.getConverted_type();
  switch (descriptor.getMaxDefinitionLevel()) {
    case 0:
      if (convertedType == null) {
        return new VarLengthColumnReaders.VarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
        case ENUM:
          return new VarLengthColumnReaders.VarCharColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          if (v instanceof VarDecimalVector) {
            return new VarLengthColumnReaders.VarDecimalColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarDecimalVector) v, schemaElement);
          }
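          // intentional fall-through: if the value vector is not a VarDecimalVector, use the VarBinary reader below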
        default:
          return new VarLengthColumnReaders.VarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
    default:
      if (convertedType == null) {
        return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarBinaryVector) v, schemaElement);
      }

      switch (convertedType) {
        case UTF8:
        case ENUM:
          return new VarLengthColumnReaders.NullableVarCharColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarCharVector) v, schemaElement);
        case DECIMAL:
          if (v instanceof NullableVarDecimalVector) {
            return new VarLengthColumnReaders.NullableVarDecimalColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarDecimalVector) v, schemaElement);
          }
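          // intentional fall-through: if the value vector is not a NullableVarDecimalVector, use the nullable VarBinary reader below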
        default:
          return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarBinaryVector) v, schemaElement);
      }
  }
}
 
Example 4
public static TypeProtos.MajorType toMajorType(PrimitiveType.PrimitiveTypeName primitiveTypeName, int length,
                                        TypeProtos.DataMode mode, SchemaElement schemaElement,
                                        OptionManager options) {
  ConvertedType convertedType = schemaElement.getConverted_type();
  MinorType minorType = getMinorType(primitiveTypeName, length, convertedType, options);
  TypeProtos.MajorType.Builder typeBuilder = TypeProtos.MajorType.newBuilder().setMinorType(minorType).setMode(mode);

  if (Types.isDecimalType(minorType)) {
    int precision = schemaElement.getPrecision();
    int scale = schemaElement.getScale();

    typeBuilder.setPrecision(precision).setScale(scale);
  }
  return typeBuilder.build();
}
 
Example 5
static VarLengthValuesColumn<?> getReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize, ColumnDescriptor descriptor,
                                          ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, ValueVector v,
                                          SchemaElement schemaElement
) throws ExecutionSetupException {
  ConvertedType convertedType = schemaElement.getConverted_type();
  switch (descriptor.getMaxDefinitionLevel()) {
    case 0:
      if (convertedType == null) {
        return new VarLengthColumnReaders.VarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
          return new VarLengthColumnReaders.VarCharColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          return new VarLengthColumnReaders.Decimal28Column(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (DecimalVector) v, schemaElement);
        default:
          return new VarLengthColumnReaders.VarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
    default:
      if (convertedType == null) {
        return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }

      switch (convertedType) {
        case UTF8:
          return new VarLengthColumnReaders.NullableVarCharColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          return new NullableDecimalColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (DecimalVector) v, schemaElement);
        default:
          return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
  }
}
 
Example 6
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) {
  switch (timeLogicalType.getUnit()) {
    case MILLIS:
      return of(ConvertedType.TIME_MILLIS);
    case MICROS:
      return of(ConvertedType.TIME_MICROS);
    case NANOS:
      return empty();
    default:
      throw new RuntimeException("Unknown converted type for " + timeLogicalType.toOriginalType());
  }
}
 
Example 7
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) {
  switch (timestampLogicalType.getUnit()) {
    case MICROS:
      return of(ConvertedType.TIMESTAMP_MICROS);
    case MILLIS:
      return of(ConvertedType.TIMESTAMP_MILLIS);
    case NANOS:
      return empty();
    default:
      throw new RuntimeException("Unknown converted type for " + timestampLogicalType.toOriginalType());
  }
}
 
Example 8
@Test
public void testSchemaConverterDecimal() {
  ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
  List<SchemaElement> schemaElements = parquetMetadataConverter.toParquetSchema(
      Types.buildMessage()
          .required(PrimitiveTypeName.BINARY)
              .as(OriginalType.DECIMAL).precision(9).scale(2)
              .named("aBinaryDecimal")
          .optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(4)
              .as(OriginalType.DECIMAL).precision(9).scale(2)
              .named("aFixedDecimal")
          .named("Message")
  );
  List<SchemaElement> expected = Lists.newArrayList(
      new SchemaElement("Message").setNum_children(2),
      new SchemaElement("aBinaryDecimal")
          .setRepetition_type(FieldRepetitionType.REQUIRED)
          .setType(Type.BYTE_ARRAY)
          .setConverted_type(ConvertedType.DECIMAL)
          .setLogicalType(LogicalType.DECIMAL(new DecimalType(2, 9)))
          .setPrecision(9).setScale(2),
      new SchemaElement("aFixedDecimal")
          .setRepetition_type(FieldRepetitionType.OPTIONAL)
          .setType(Type.FIXED_LEN_BYTE_ARRAY)
          .setType_length(4)
          .setConverted_type(ConvertedType.DECIMAL)
          .setLogicalType(LogicalType.DECIMAL(new DecimalType(2, 9)))
          .setPrecision(9).setScale(2)
  );
  Assert.assertEquals(expected, schemaElements);
}
 
Example 9
/**
 * Detect corrupt date values by looking at the min/max values in the metadata.
 *
 * This should only be used when a file does not have enough metadata to determine if
 * the data was written with an external tool or an older version of Drill
 * ({@link org.apache.drill.exec.store.parquet.ParquetRecordWriter#WRITER_VERSION_PROPERTY} <
 * {@link org.apache.drill.exec.store.parquet.ParquetReaderUtility#DRILL_WRITER_VERSION_STD_DATE_FORMAT})
 *
 * This method only checks the first Row Group, because Drill has only ever written
 * a single Row Group per file.
 *
 * @param footer parquet footer
 * @param columns list of columns schema path
 * @param autoCorrectCorruptDates user setting to allow enabling/disabling of auto-correction
 *                                of corrupt dates. There are some rare cases (storing dates thousands
 *                                of years into the future, with tools other than Drill writing files)
 *                                that would result in the date values being "corrected" into bad values.
 */
public static DateCorruptionStatus checkForCorruptDateValuesInStatistics(ParquetMetadata footer,
                                                            List<SchemaPath> columns,
                                                            boolean autoCorrectCorruptDates) {
  // Users can turn off date correction in cases where corruption is detected based on date values
  // that are unlikely to appear in common datasets. In that case, report that no correction needs to happen
  // during the file read
  if (! autoCorrectCorruptDates) {
    return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
  }
  // Drill-produced files have only ever had a single row group; if this changes in the future it won't matter,
  // as the Drill version written in the files will tell us that the dates are correct
  int rowGroupIndex = 0;
  Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
  findDateColWithStatsLoop : for (SchemaPath schemaPath : columns) {
    List<ColumnDescriptor> parquetColumns = footer.getFileMetaData().getSchema().getColumns();
    for (int i = 0; i < parquetColumns.size(); ++i) {
      ColumnDescriptor column = parquetColumns.get(i);
      // This reader only supports flat data; that is enforced in the ParquetScanBatchCreator.
      // Creating a NameSegment makes sure we are using the standard code for comparing names,
      // which is currently all case-insensitive
      if (Utilities.isStarQuery(columns)
          || getFullColumnPath(column).equalsIgnoreCase(schemaPath.getUnIndexed().toString())) {
        int colIndex = -1;
        ConvertedType convertedType = schemaElements.get(getFullColumnPath(column)).getConverted_type();
        if (convertedType != null && convertedType.equals(ConvertedType.DATE)) {
          List<ColumnChunkMetaData> colChunkList = footer.getBlocks().get(rowGroupIndex).getColumns();
          for (int j = 0; j < colChunkList.size(); j++) {
            if (colChunkList.get(j).getPath().equals(ColumnPath.get(column.getPath()))) {
              colIndex = j;
              break;
            }
          }
        }
        if (colIndex == -1) {
          // column does not appear in this file, skip it
          continue;
        }
        IntStatistics statistics = (IntStatistics) footer.getBlocks().get(rowGroupIndex).getColumns().get(colIndex).getStatistics();
        return (statistics.hasNonNullValue() && statistics.compareMaxToValue(ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) > 0) ?
            DateCorruptionStatus.META_SHOWS_CORRUPTION : DateCorruptionStatus.META_UNCLEAR_TEST_VALUES;
      }
    }
  }
  return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
}
 
Example 10
public static NullableColumnReader<?> getNullableColumnReader(ParquetRecordReader parentReader,
                                                           ColumnDescriptor columnDescriptor,
                                                           ColumnChunkMetaData columnChunkMetaData,
                                                           boolean fixedLength,
                                                           ValueVector valueVec,
                                                           SchemaElement schemaElement) throws ExecutionSetupException {
  ConvertedType convertedType = schemaElement.getConverted_type();

  if (! columnChunkMetaData.getEncodings().contains(Encoding.PLAIN_DICTIONARY)) {
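    // no dictionary encoding on this column chunk: pick a plain fixed-width/binary reader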
    if (columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.INT96) {
      // TODO: check convertedType once parquet supports the TIMESTAMP_NANOS type annotation.
      if (parentReader.getFragmentContext().getOptions().getOption(ExecConstants.PARQUET_READER_INT96_AS_TIMESTAMP).bool_val) {
        return new NullableFixedByteAlignedReaders.NullableFixedBinaryAsTimeStampReader(parentReader, columnDescriptor, columnChunkMetaData, true, (NullableTimeStampVector) valueVec, schemaElement);
      } else {
        return new NullableFixedByteAlignedReaders.NullableFixedBinaryReader(parentReader, columnDescriptor, columnChunkMetaData, true, (NullableVarBinaryVector) valueVec, schemaElement);
      }
    } else if (convertedType == ConvertedType.DECIMAL) {
      // NullableVarDecimalVector allows storing values of different widths, so the offsets vector
      // must be updated every time a value is added.
      // Therefore NullableVarDecimalReader is used here instead of NullableFixedByteAlignedReader.
      return new NullableFixedByteAlignedReaders.NullableVarDecimalReader(parentReader,
          columnDescriptor, columnChunkMetaData, fixedLength, (NullableVarDecimalVector) valueVec, schemaElement);
    } else {
      return new NullableFixedByteAlignedReaders.NullableFixedByteAlignedReader<>(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, valueVec, schemaElement);
    }
  } else {
    switch (columnDescriptor.getType()) {
      case INT32:
        if (convertedType == null) {
          return new NullableFixedByteAlignedReaders.NullableDictionaryIntReader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableIntVector) valueVec, schemaElement);
        }
        switch (convertedType) {
          case DECIMAL:
            return new NullableFixedByteAlignedReaders.NullableDictionaryVarDecimalReader(parentReader,
                columnDescriptor, columnChunkMetaData, fixedLength, (NullableVarDecimalVector) valueVec, schemaElement);
          case TIME_MILLIS:
            return new NullableFixedByteAlignedReaders.NullableDictionaryTimeReader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableTimeVector)valueVec, schemaElement);
          default:
            throw new ExecutionSetupException("Unsupported nullable converted type " + convertedType + " for primitive type INT32");
        }
      case INT64:
        if (convertedType == null) {
          return new NullableFixedByteAlignedReaders.NullableDictionaryBigIntReader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableBigIntVector)valueVec, schemaElement);
        }
        switch (convertedType) {
          case DECIMAL:
            return new NullableFixedByteAlignedReaders.NullableDictionaryVarDecimalReader(parentReader,
                columnDescriptor, columnChunkMetaData, fixedLength, (NullableVarDecimalVector) valueVec, schemaElement);
          case TIMESTAMP_MILLIS:
            return new NullableFixedByteAlignedReaders.NullableDictionaryTimeStampReader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableTimeStampVector)valueVec, schemaElement);
          // DRILL-6670: handle TIMESTAMP_MICROS as INT64 with no logical type
          case TIMESTAMP_MICROS:
            return new NullableFixedByteAlignedReaders.NullableDictionaryBigIntReader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableBigIntVector)valueVec, schemaElement);
          default:
            throw new ExecutionSetupException("Unsupported nullable converted type " + convertedType + " for primitive type INT64");
        }
      case INT96:
        // TODO: check convertedType once parquet supports the TIMESTAMP_NANOS type annotation.
        if (parentReader.getFragmentContext().getOptions().getOption(ExecConstants.PARQUET_READER_INT96_AS_TIMESTAMP).bool_val) {
          return new NullableFixedByteAlignedReaders.NullableFixedBinaryAsTimeStampReader(parentReader, columnDescriptor, columnChunkMetaData, true, (NullableTimeStampVector) valueVec, schemaElement);
        } else {
          return new NullableFixedByteAlignedReaders.NullableFixedBinaryReader(parentReader, columnDescriptor, columnChunkMetaData, true, (NullableVarBinaryVector) valueVec, schemaElement);
        }
      case FLOAT:
        return new NullableFixedByteAlignedReaders.NullableDictionaryFloat4Reader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableFloat4Vector)valueVec, schemaElement);
      case DOUBLE:
        return new NullableFixedByteAlignedReaders.NullableDictionaryFloat8Reader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableFloat8Vector)valueVec, schemaElement);
      default:
        throw new ExecutionSetupException("Unsupported nullable column type " + columnDescriptor.getType().name());
    }
  }
}
 
Example 11
private static TypeProtos.MinorType getMinorType(PrimitiveType.PrimitiveTypeName primitiveTypeName, int length,
    ConvertedType convertedType, OptionManager options) {
  switch (primitiveTypeName) {
    case BINARY:
      if (convertedType == null) {
        return (TypeProtos.MinorType.VARBINARY);
      }
      switch (convertedType) {
        case UTF8:
        case ENUM:
          return (TypeProtos.MinorType.VARCHAR);
        case DECIMAL:
          ParquetReaderUtility.checkDecimalTypeEnabled(options);
          return TypeProtos.MinorType.VARDECIMAL;
        default:
          return (TypeProtos.MinorType.VARBINARY);
      }
    case INT64:
      if (convertedType == null) {
        return (TypeProtos.MinorType.BIGINT);
      }
      switch(convertedType) {
        // DRILL-6670: handle TIMESTAMP_MICROS as INT64 with no logical type
        case INT_64:
        case TIMESTAMP_MICROS:
          return TypeProtos.MinorType.BIGINT;
        case UINT_64:
          return TypeProtos.MinorType.UINT8;
        case DECIMAL:
          ParquetReaderUtility.checkDecimalTypeEnabled(options);
          return TypeProtos.MinorType.VARDECIMAL;
        case TIMESTAMP_MILLIS:
          return TypeProtos.MinorType.TIMESTAMP;
        default:
          throw new UnsupportedOperationException(String.format("unsupported type: %s %s", primitiveTypeName, convertedType));
      }
    case INT32:
      if (convertedType == null) {
        return TypeProtos.MinorType.INT;
      }
      switch(convertedType) {
        case UINT_8:
        case UINT_16:
        case UINT_32:
          return TypeProtos.MinorType.UINT4;
        case INT_8:
        case INT_16:
        case INT_32:
          return TypeProtos.MinorType.INT;
        case DECIMAL:
          ParquetReaderUtility.checkDecimalTypeEnabled(options);
          return TypeProtos.MinorType.VARDECIMAL;
        case DATE:
          return TypeProtos.MinorType.DATE;
        case TIME_MILLIS:
          return TypeProtos.MinorType.TIME;
        default:
          throw new UnsupportedOperationException(String.format("unsupported type: %s %s", primitiveTypeName, convertedType));
      }
    case BOOLEAN:
      return TypeProtos.MinorType.BIT;
    case FLOAT:
      return TypeProtos.MinorType.FLOAT4;
    case DOUBLE:
      return TypeProtos.MinorType.FLOAT8;
    // TODO - Both of these are not supported by the parquet library yet (7/3/13),
    // but they are declared here for when they are implemented
    case INT96:
      if (options.getOption(ExecConstants.PARQUET_READER_INT96_AS_TIMESTAMP).bool_val) {
        return TypeProtos.MinorType.TIMESTAMP;
      } else {
        return TypeProtos.MinorType.VARBINARY;
      }
    case FIXED_LEN_BYTE_ARRAY:
      if (convertedType == null) {
        checkArgument(length > 0, "A length greater than zero must be provided for a FixedBinary type.");
        return TypeProtos.MinorType.VARBINARY;
      } else if (convertedType == ConvertedType.DECIMAL) {
        ParquetReaderUtility.checkDecimalTypeEnabled(options);
        return TypeProtos.MinorType.VARDECIMAL;
      } else if (convertedType == ConvertedType.INTERVAL) {
        return TypeProtos.MinorType.INTERVAL;
      }
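      // a FIXED_LEN_BYTE_ARRAY with any other converted type falls through to the default and throws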
    default:
      throw new UnsupportedOperationException("Type not supported: " + primitiveTypeName);
  }
}
 
Example 12
private static ConvertedType getConvertedType(OriginalType type)
{
    switch (type) {
        case UTF8:
            return ConvertedType.UTF8;
        case MAP:
            return ConvertedType.MAP;
        case MAP_KEY_VALUE:
            return ConvertedType.MAP_KEY_VALUE;
        case LIST:
            return ConvertedType.LIST;
        case ENUM:
            return ConvertedType.ENUM;
        case DECIMAL:
            return ConvertedType.DECIMAL;
        case DATE:
            return ConvertedType.DATE;
        case TIME_MILLIS:
            return ConvertedType.TIME_MILLIS;
        case TIMESTAMP_MILLIS:
            return ConvertedType.TIMESTAMP_MILLIS;
        case INTERVAL:
            return ConvertedType.INTERVAL;
        case INT_8:
            return ConvertedType.INT_8;
        case INT_16:
            return ConvertedType.INT_16;
        case INT_32:
            return ConvertedType.INT_32;
        case INT_64:
            return ConvertedType.INT_64;
        case UINT_8:
            return ConvertedType.UINT_8;
        case UINT_16:
            return ConvertedType.UINT_16;
        case UINT_32:
            return ConvertedType.UINT_32;
        case UINT_64:
            return ConvertedType.UINT_64;
        case JSON:
            return ConvertedType.JSON;
        case BSON:
            return ConvertedType.BSON;
        default:
            throw new RuntimeException("Unknown original type " + type);
    }
}
 
Example 13
private static OriginalType getOriginalType(ConvertedType type)
{
    switch (type) {
        case UTF8:
            return OriginalType.UTF8;
        case MAP:
            return OriginalType.MAP;
        case MAP_KEY_VALUE:
            return OriginalType.MAP_KEY_VALUE;
        case LIST:
            return OriginalType.LIST;
        case ENUM:
            return OriginalType.ENUM;
        case DECIMAL:
            return OriginalType.DECIMAL;
        case DATE:
            return OriginalType.DATE;
        case TIME_MILLIS:
            return OriginalType.TIME_MILLIS;
        case TIMESTAMP_MILLIS:
            return OriginalType.TIMESTAMP_MILLIS;
        case INTERVAL:
            return OriginalType.INTERVAL;
        case INT_8:
            return OriginalType.INT_8;
        case INT_16:
            return OriginalType.INT_16;
        case INT_32:
            return OriginalType.INT_32;
        case INT_64:
            return OriginalType.INT_64;
        case UINT_8:
            return OriginalType.UINT_8;
        case UINT_16:
            return OriginalType.UINT_16;
        case UINT_32:
            return OriginalType.UINT_32;
        case UINT_64:
            return OriginalType.UINT_64;
        case JSON:
            return OriginalType.JSON;
        case BSON:
            return OriginalType.BSON;
        case TIMESTAMP_MICROS:
            return OriginalType.TIMESTAMP_MICROS;
        case TIME_MICROS:
            return OriginalType.TIME_MICROS;
        default:
            throw new IllegalArgumentException("Unknown converted type " + type);
    }
}
 
Example 14
/**
 * Detect corrupt date values by looking at the min/max values in the metadata.
 *
 * This should only be used when a file does not have enough metadata to determine if
 * the data was written with an older version of Drill, or an external tool. Drill
 * versions 1.3 and beyond should have enough metadata to confirm that the data was written
 * by Drill.
 *
 * This method only checks the first Row Group, because Drill has only ever written
 * a single Row Group per file.
 *
 * @param footer parquet footer
 * @param columns list of columns schema path
 * @param autoCorrectCorruptDates user setting to allow enabling/disabling of auto-correction
 *                                of corrupt dates. There are some rare cases (storing dates thousands
 *                                of years into the future, with tools other than Drill writing files)
 *                                that would result in the date values being "corrected" into bad values.
 */
public static DateCorruptionStatus checkForCorruptDateValuesInStatistics(ParquetMetadata footer,
                                                            List<SchemaPath> columns,
                                                            boolean autoCorrectCorruptDates) {
  // Users can turn off date correction in cases where corruption is detected based on date values
  // that are unlikely to appear in common datasets. In that case, report that no correction needs to happen
  // during the file read
  if (! autoCorrectCorruptDates) {
    return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
  }
  // Drill-produced files have only ever had a single row group; if this changes in the future it won't matter,
  // as the Drill version written in the files will tell us that the dates are correct
  int rowGroupIndex = 0;
  Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
  findDateColWithStatsLoop : for (SchemaPath schemaPath : columns) {
    List<ColumnDescriptor> parquetColumns = footer.getFileMetaData().getSchema().getColumns();
    for (int i = 0; i < parquetColumns.size(); ++i) {
      ColumnDescriptor column = parquetColumns.get(i);
      // This reader only supports flat data; that is enforced in the ParquetScanBatchCreator.
      // Creating a NameSegment makes sure we are using the standard code for comparing names,
      // which is currently all case-insensitive
      if (ColumnUtils.isStarQuery(columns) || new PathSegment.NameSegment(column.getPath()[0]).equals(schemaPath.getRootSegment())) {
        int colIndex = -1;
        ConvertedType convertedType = schemaElements.get(column.getPath()[0]).getConverted_type();
        if (convertedType != null && convertedType.equals(ConvertedType.DATE)) {
          List<ColumnChunkMetaData> colChunkList = footer.getBlocks().get(rowGroupIndex).getColumns();
          for (int j = 0; j < colChunkList.size(); j++) {
            if (colChunkList.get(j).getPath().equals(ColumnPath.get(column.getPath()))) {
              colIndex = j;
              break;
            }
          }
        }
        if (colIndex == -1) {
          // column does not appear in this file, skip it
          continue;
        }
        Statistics statistics = footer.getBlocks().get(rowGroupIndex).getColumns().get(colIndex).getStatistics();
        Integer max = (Integer) statistics.genericGetMax();
        if (statistics.hasNonNullValue()) {
          if (max > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) {
            return DateCorruptionStatus.META_SHOWS_CORRUPTION;
          }
        } else {
          // no statistics, go check the first page
          return DateCorruptionStatus.META_UNCLEAR_TEST_VALUES;
        }
      }
    }
  }
  return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
}
 
Example 15
public static NullableColumnReader<?> getNullableColumnReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize,
                                                              ColumnDescriptor columnDescriptor,
                                                              ColumnChunkMetaData columnChunkMetaData,
                                                              boolean fixedLength,
                                                              ValueVector valueVec,
                                                              SchemaElement schemaElement) throws ExecutionSetupException {
  ConvertedType convertedType = schemaElement.getConverted_type();

  if (! columnChunkMetaData.getEncodings().contains(Encoding.PLAIN_DICTIONARY)) {
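    // no dictionary encoding on this column chunk: pick a plain fixed-width/binary reader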
    if (columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.INT96) {
      // TODO: check convertedType once parquet supports the TIMESTAMP_NANOS type annotation.
      if (parentReader.readInt96AsTimeStamp()) {
        return new NullableFixedByteAlignedReaders.NullableFixedBinaryAsTimeStampReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, true, (TimeStampMilliVector) valueVec, schemaElement);
      } else {
        return new NullableFixedByteAlignedReaders.NullableFixedBinaryReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, true, (VarBinaryVector) valueVec, schemaElement);
      }
    } else {
      return new NullableFixedByteAlignedReaders.NullableFixedByteAlignedReader<>(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, valueVec, schemaElement);
    }
  } else {
    switch (columnDescriptor.getType()) {
      case INT32:
        if (convertedType == null) {
          return new NullableFixedByteAlignedReaders.NullableDictionaryIntReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (IntVector) valueVec, schemaElement);
        }
        switch (convertedType) {
          case DECIMAL:
            return new NullableFixedByteAlignedReaders.NullableDictionaryDecimal9Reader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (DecimalVector) valueVec, schemaElement);
          case TIME_MILLIS:
            return new NullableFixedByteAlignedReaders.NullableDictionaryTimeReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (TimeMilliVector)valueVec, schemaElement);
          default:
            throw new ExecutionSetupException("Unsupported nullable converted type " + convertedType + " for primitive type INT32");
        }
      case INT64:
        if (convertedType == null) {
          return new NullableFixedByteAlignedReaders.NullableDictionaryBigIntReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (BigIntVector)valueVec, schemaElement);
        }
        switch (convertedType) {
          case DECIMAL:
            return new NullableFixedByteAlignedReaders.NullableDictionaryDecimal18Reader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (DecimalVector)valueVec, schemaElement);
          case TIMESTAMP_MILLIS:
            return new NullableFixedByteAlignedReaders.NullableDictionaryTimeStampReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (TimeStampMilliVector)valueVec, schemaElement);
          default:
            throw new ExecutionSetupException("Unsupported nullable converted type " + convertedType + " for primitive type INT64");
        }
      case INT96:
        // TODO: check convertedType once parquet supports the TIMESTAMP_NANOS type annotation.
        if (parentReader.readInt96AsTimeStamp()) {
          return new NullableFixedByteAlignedReaders.NullableFixedBinaryAsTimeStampReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, true, (TimeStampMilliVector) valueVec, schemaElement);
        } else {
          return new NullableFixedByteAlignedReaders.NullableFixedBinaryReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, true, (VarBinaryVector) valueVec, schemaElement);
        }
      case FLOAT:
        return new NullableFixedByteAlignedReaders.NullableDictionaryFloat4Reader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (Float4Vector)valueVec, schemaElement);
      case DOUBLE:
        return new NullableFixedByteAlignedReaders.NullableDictionaryFloat8Reader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (Float8Vector)valueVec, schemaElement);
      default:
        throw new ExecutionSetupException("Unsupported nullable column type " + columnDescriptor.getType().name());
    }
  }
}
 
Example 16
private static TypeProtos.MinorType getMinorType(PrimitiveType.PrimitiveTypeName primitiveTypeName, int length,
                                                   SchemaElement schemaElement, OptionManager options, Field arrowField,
                                                   final boolean readInt96AsTimeStamp) {

    ConvertedType convertedType = schemaElement.getConverted_type();

    switch (primitiveTypeName) {
      case BINARY:
        if (convertedType == null) {
          return TypeProtos.MinorType.VARBINARY;
        }
        switch (convertedType) {
          case UTF8:
            return TypeProtos.MinorType.VARCHAR;
          case DECIMAL:
            ParquetReaderUtility.checkDecimalTypeEnabled(options);
            return getDecimalType(schemaElement);
          default:
            return TypeProtos.MinorType.VARBINARY;
        }
      case INT64:
        if (convertedType == null) {
          return TypeProtos.MinorType.BIGINT;
        }
        switch(convertedType) {
          case DECIMAL:
            ParquetReaderUtility.checkDecimalTypeEnabled(options);
            return TypeProtos.MinorType.DECIMAL;
          // TODO - add this back if it is decided to be added upstream; it was removed from our pull request in July 2014
//              case TIME_MICROS:
//                throw new UnsupportedOperationException();
          case TIMESTAMP_MILLIS:
            return TypeProtos.MinorType.TIMESTAMP;
          default:
            throw new UnsupportedOperationException(String.format("unsupported type: %s %s", primitiveTypeName, convertedType));
        }
      case INT32:
        if (convertedType == null) {
          return TypeProtos.MinorType.INT;
        }
        switch(convertedType) {
          case DECIMAL:
            ParquetReaderUtility.checkDecimalTypeEnabled(options);
            return TypeProtos.MinorType.DECIMAL;
          case DATE:
            return TypeProtos.MinorType.DATE;
          case TIME_MILLIS:
            return TypeProtos.MinorType.TIME;
          default:
            throw new UnsupportedOperationException(String.format("unsupported type: %s %s", primitiveTypeName, convertedType));
        }
      case BOOLEAN:
        return TypeProtos.MinorType.BIT;
      case FLOAT:
        return TypeProtos.MinorType.FLOAT4;
      case DOUBLE:
        return TypeProtos.MinorType.FLOAT8;
      // TODO - Both of these are not supported by the parquet library yet (7/3/13),
      // but they are declared here for when they are implemented
      case INT96:
        if (readInt96AsTimeStamp) {
          return TypeProtos.MinorType.TIMESTAMP;
        } else {
          return TypeProtos.MinorType.VARBINARY;
        }
      case FIXED_LEN_BYTE_ARRAY:
        if (convertedType == null) {
          checkArgument(length > 0, "A length greater than zero must be provided for a FixedBinary type.");
          return TypeProtos.MinorType.VARBINARY;
        } else if (convertedType == ConvertedType.DECIMAL) {
          ParquetReaderUtility.checkDecimalTypeEnabled(options);
          return getDecimalType(schemaElement);
        } else if (convertedType == ConvertedType.INTERVAL) {
          if (arrowField != null) {
            if (arrowField.getType().getTypeID() == ArrowTypeID.Interval) {
              switch (((Interval)arrowField.getType()).getUnit()) {
                case DAY_TIME:
                  return TypeProtos.MinorType.INTERVALDAY;
                case YEAR_MONTH:
                  return TypeProtos.MinorType.INTERVALYEAR;
              }
            }
            throw new IllegalArgumentException("incompatible type " + arrowField);
          }
          // TODO: older versions of Drill generated this
          return TypeProtos.MinorType.VARBINARY;
        }
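        // a FIXED_LEN_BYTE_ARRAY with any other converted type falls through to the default and throws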
      default:
        throw new UnsupportedOperationException("Type not supported: " + primitiveTypeName);
    }
  }
 
Example 17
ConvertedType convertToConvertedType(LogicalTypeAnnotation logicalTypeAnnotation) {
  return logicalTypeAnnotation.accept(CONVERTED_TYPE_CONVERTER_VISITOR).orElse(null);
}
 
Example 18
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) {
  return of(ConvertedType.UTF8);
}
 
Example 19
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) {
  return of(ConvertedType.MAP);
}
 
Example 20
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) {
  return of(ConvertedType.LIST);
}
 
Example 21
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) {
  return of(ConvertedType.ENUM);
}
 
Example 22
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) {
  return of(ConvertedType.DECIMAL);
}
 
Example 23
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) {
  return of(ConvertedType.DATE);
}
 
Example 24
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) {
  return of(ConvertedType.JSON);
}
 
Example 25
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) {
  return of(ConvertedType.BSON);
}
 
Example 26
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) {
  return of(ConvertedType.INTERVAL);
}
 
Example 27
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.MapKeyValueTypeAnnotation mapKeyValueLogicalType) {
  return of(ConvertedType.MAP_KEY_VALUE);
}
 
Example 28
LogicalTypeAnnotation getLogicalTypeAnnotation(ConvertedType type, SchemaElement schemaElement) {
  switch (type) {
    case UTF8:
      return LogicalTypeAnnotation.stringType();
    case MAP:
      return LogicalTypeAnnotation.mapType();
    case MAP_KEY_VALUE:
      return LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance();
    case LIST:
      return LogicalTypeAnnotation.listType();
    case ENUM:
      return LogicalTypeAnnotation.enumType();
    case DECIMAL:
      int scale = (schemaElement == null ? 0 : schemaElement.scale);
      int precision = (schemaElement == null ? 0 : schemaElement.precision);
      return LogicalTypeAnnotation.decimalType(scale, precision);
    case DATE:
      return LogicalTypeAnnotation.dateType();
    case TIME_MILLIS:
      return LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MILLIS);
    case TIME_MICROS:
      return LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MICROS);
    case TIMESTAMP_MILLIS:
      return LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MILLIS);
    case TIMESTAMP_MICROS:
      return LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS);
    case INTERVAL:
      return LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance();
    case INT_8:
      return LogicalTypeAnnotation.intType(8, true);
    case INT_16:
      return LogicalTypeAnnotation.intType(16, true);
    case INT_32:
      return LogicalTypeAnnotation.intType(32, true);
    case INT_64:
      return LogicalTypeAnnotation.intType(64, true);
    case UINT_8:
      return LogicalTypeAnnotation.intType(8, false);
    case UINT_16:
      return LogicalTypeAnnotation.intType(16, false);
    case UINT_32:
      return LogicalTypeAnnotation.intType(32, false);
    case UINT_64:
      return LogicalTypeAnnotation.intType(64, false);
    case JSON:
      return LogicalTypeAnnotation.jsonType();
    case BSON:
      return LogicalTypeAnnotation.bsonType();
    default:
      throw new RuntimeException("Can't convert converted type to logical type, unknown converted type " + type);
  }
}
 
Example 29
private void buildChildren(Types.GroupBuilder builder,
                           Iterator<SchemaElement> schema,
                           int childrenCount,
                           List<ColumnOrder> columnOrders,
                           int columnCount) {
  for (int i = 0; i < childrenCount; i++) {
    SchemaElement schemaElement = schema.next();

    // Create Parquet Type.
    Types.Builder childBuilder;
    if (schemaElement.type != null) {
      Types.PrimitiveBuilder primitiveBuilder = builder.primitive(
          getPrimitive(schemaElement.type),
          fromParquetRepetition(schemaElement.repetition_type));
      if (schemaElement.isSetType_length()) {
        primitiveBuilder.length(schemaElement.type_length);
      }
      if (schemaElement.isSetPrecision()) {
        primitiveBuilder.precision(schemaElement.precision);
      }
      if (schemaElement.isSetScale()) {
        primitiveBuilder.scale(schemaElement.scale);
      }
      if (columnOrders != null) {
        org.apache.parquet.schema.ColumnOrder columnOrder = fromParquetColumnOrder(columnOrders.get(columnCount));
        // As of parquet-format 2.4.0, no UNDEFINED order is supported, so set an undefined column order for the types
        // where ordering is not supported.
        if (columnOrder.getColumnOrderName() == ColumnOrderName.TYPE_DEFINED_ORDER
            && (schemaElement.type == Type.INT96 || schemaElement.converted_type == ConvertedType.INTERVAL)) {
          columnOrder = org.apache.parquet.schema.ColumnOrder.undefined();
        }
        primitiveBuilder.columnOrder(columnOrder);
      }
      childBuilder = primitiveBuilder;

    } else {
      childBuilder = builder.group(fromParquetRepetition(schemaElement.repetition_type));
      buildChildren((Types.GroupBuilder) childBuilder, schema, schemaElement.num_children, columnOrders, columnCount);
    }

    if (schemaElement.isSetLogicalType()) {
      childBuilder.as(getLogicalTypeAnnotation(schemaElement.logicalType));
    }
    if (schemaElement.isSetConverted_type()) {
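      // when converted_type and logicalType disagree, the legacy converted_type wins (see the warning below)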
      OriginalType originalType = getLogicalTypeAnnotation(schemaElement.converted_type, schemaElement).toOriginalType();
      OriginalType newOriginalType = (schemaElement.isSetLogicalType() && getLogicalTypeAnnotation(schemaElement.logicalType) != null) ?
         getLogicalTypeAnnotation(schemaElement.logicalType).toOriginalType() : null;
      if (!originalType.equals(newOriginalType)) {
        if (newOriginalType != null) {
          LOG.warn("Converted type and logical type metadata mismatch (convertedType: {}, logical type: {}). Using value in converted type.",
            schemaElement.converted_type, schemaElement.logicalType);
        }
        childBuilder.as(originalType);
      }
    }
    if (schemaElement.isSetField_id()) {
      childBuilder.id(schemaElement.field_id);
    }

    childBuilder.named(schemaElement.name);
    ++columnCount;
  }
}
 
Example 30
@Test
public void testLogicalToConvertedTypeConversion() {
  ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();

  assertEquals(ConvertedType.UTF8, parquetMetadataConverter.convertToConvertedType(stringType()));
  assertEquals(ConvertedType.ENUM, parquetMetadataConverter.convertToConvertedType(enumType()));

  assertEquals(ConvertedType.INT_8, parquetMetadataConverter.convertToConvertedType(intType(8, true)));
  assertEquals(ConvertedType.INT_16, parquetMetadataConverter.convertToConvertedType(intType(16, true)));
  assertEquals(ConvertedType.INT_32, parquetMetadataConverter.convertToConvertedType(intType(32, true)));
  assertEquals(ConvertedType.INT_64, parquetMetadataConverter.convertToConvertedType(intType(64, true)));
  assertEquals(ConvertedType.UINT_8, parquetMetadataConverter.convertToConvertedType(intType(8, false)));
  assertEquals(ConvertedType.UINT_16, parquetMetadataConverter.convertToConvertedType(intType(16, false)));
  assertEquals(ConvertedType.UINT_32, parquetMetadataConverter.convertToConvertedType(intType(32, false)));
  assertEquals(ConvertedType.UINT_64, parquetMetadataConverter.convertToConvertedType(intType(64, false)));
  assertEquals(ConvertedType.DECIMAL, parquetMetadataConverter.convertToConvertedType(decimalType(8, 16)));

  assertEquals(ConvertedType.TIMESTAMP_MILLIS, parquetMetadataConverter.convertToConvertedType(timestampType(true, MILLIS)));
  assertEquals(ConvertedType.TIMESTAMP_MICROS, parquetMetadataConverter.convertToConvertedType(timestampType(true, MICROS)));
  assertNull(parquetMetadataConverter.convertToConvertedType(timestampType(true, NANOS)));
  assertEquals(ConvertedType.TIMESTAMP_MILLIS, parquetMetadataConverter.convertToConvertedType(timestampType(false, MILLIS)));
  assertEquals(ConvertedType.TIMESTAMP_MICROS, parquetMetadataConverter.convertToConvertedType(timestampType(false, MICROS)));
  assertNull(parquetMetadataConverter.convertToConvertedType(timestampType(false, NANOS)));

  assertEquals(ConvertedType.TIME_MILLIS, parquetMetadataConverter.convertToConvertedType(timeType(true, MILLIS)));
  assertEquals(ConvertedType.TIME_MICROS, parquetMetadataConverter.convertToConvertedType(timeType(true, MICROS)));
  assertNull(parquetMetadataConverter.convertToConvertedType(timeType(true, NANOS)));
  assertEquals(ConvertedType.TIME_MILLIS, parquetMetadataConverter.convertToConvertedType(timeType(false, MILLIS)));
  assertEquals(ConvertedType.TIME_MICROS, parquetMetadataConverter.convertToConvertedType(timeType(false, MICROS)));
  assertNull(parquetMetadataConverter.convertToConvertedType(timeType(false, NANOS)));

  assertEquals(ConvertedType.DATE, parquetMetadataConverter.convertToConvertedType(dateType()));

  assertEquals(ConvertedType.INTERVAL, parquetMetadataConverter.convertToConvertedType(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance()));
  assertEquals(ConvertedType.JSON, parquetMetadataConverter.convertToConvertedType(jsonType()));
  assertEquals(ConvertedType.BSON, parquetMetadataConverter.convertToConvertedType(bsonType()));

  assertNull(parquetMetadataConverter.convertToConvertedType(uuidType()));

  assertEquals(ConvertedType.LIST, parquetMetadataConverter.convertToConvertedType(listType()));
  assertEquals(ConvertedType.MAP, parquetMetadataConverter.convertToConvertedType(mapType()));
  assertEquals(ConvertedType.MAP_KEY_VALUE, parquetMetadataConverter.convertToConvertedType(LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance()));
}