All downloads are free. Search and download functionality uses the official Maven repository.

water.parser.parquet.ParquetParserProvider Maven / Gradle / Ivy

package water.parser.parquet;

import org.apache.parquet.format.converter.ParquetMetadataConverter;
import water.DKV;
import water.Job;
import water.Key;
import water.fvec.ByteVec;
import water.fvec.Frame;
import water.fvec.Vec;
import water.parser.*;

import static water.parser.parquet.ParquetParser.extractColumnTypes;

/**
 * Parser provider for the Parquet format.
 *
 * <p>Exposes the Parquet parser descriptor, creates {@link ParquetParser} instances and
 * prepares the {@link ParseSetup} objects used along the way. The actual format and data
 * guessing is delegated to static helpers on {@link ParquetParser}.
 */
public class ParquetParserProvider extends BinaryParserProvider {

  /* Descriptor of the Parquet parser; its priority sits 20 above MAX_CORE_PRIO */
  static ParserInfo PARQUET_INFO = new ParserInfo(
          "PARQUET", DefaultParserProviders.MAX_CORE_PRIO + 20, true, false, true, false);

  /**
   * @return the shared Parquet parser descriptor
   */
  @Override
  public ParserInfo info() {
    return PARQUET_INFO;
  }

  /**
   * Instantiates a Parquet parser.
   *
   * @param setup  parse setup handed to the parser
   * @param jobKey key of the job handed to the parser
   * @return a new {@link ParquetParser}
   */
  @Override
  public Parser createParser(ParseSetup setup, Key jobKey) {
    return new ParquetParser(setup, jobKey);
  }

  /**
   * Produces the initial setup guess by delegating to
   * {@link ParquetParser#guessFormatSetup}.
   *
   * @param v         vec holding the input
   * @param bits      bytes of the input passed on to the guesser
   * @param userSetup not consulted by this implementation
   * @return the guessed setup
   */
  @Override
  public ParseSetup guessInitSetup(ByteVec v, byte[] bits, ParseSetup userSetup) {
    return ParquetParser.guessFormatSetup(v, bits);
  }

  /**
   * Produces the final setup guess, restricting it to the columns selected in {@code ps}
   * (if any selection is present).
   *
   * @param v    vec holding the input
   * @param bits not consulted by this implementation
   * @param ps   current setup; cast to {@code ParquetParser.ParquetParseSetup}
   * @return the setup returned by {@link ParquetParser#guessDataSetup}
   */
  @Override
  public ParseSetup guessFinalSetup(ByteVec v, byte[] bits, ParseSetup ps) {
    final boolean[] keepColumns = buildKeepMask(ps);
    return ParquetParser.guessDataSetup(v, (ParquetParser.ParquetParseSetup) ps, keepColumns);
  }

  /**
   * Turns the list of selected column indices of a setup into a per-column flag array.
   *
   * @param ps setup carrying the (optional) column selection
   * @return {@code null} when the setup has no column selection; otherwise an array with one
   *         entry per column of the setup, {@code true} at every selected index
   */
  private static boolean[] buildKeepMask(ParseSetup ps) {
    final int[] selected = ps.get_parse_columns_indices();
    if (selected == null) {
      return null;
    }
    final boolean[] mask = new boolean[ps.getNumberColumns()];
    for (int k = 0; k < selected.length; k++) {
      mask[selected[k]] = true;
    }
    return mask;
  }

  /**
   * Builds the setup used for parsing, based on the first input only.
   *
   * <p>The requested setup is converted to a {@code ParquetParser.ParquetParseSetup} when it
   * is not one already. The requested column types are then passed through
   * {@link ParquetParser#correctTypeConversions}, and for every column whose type was changed
   * an {@code UnsupportedTypeOverride} error is attached to the setup.
   *
   * @param inputs         input keys; only {@code inputs[0]} is read
   * @param requestedSetup setup requested by the caller
   * @return the (possibly converted) setup with corrected column types
   */
  @Override
  public ParseSetup createParserSetup(Key[] inputs, ParseSetup requestedSetup) {
    // make sure we are working with a ParquetParseSetup
    final ParseSetup setup;
    if (requestedSetup instanceof ParquetParser.ParquetParseSetup) {
      setup = requestedSetup;
    } else {
      setup = requestedSetup.copyTo(new ParquetParser.ParquetParseSetup());
    }

    // the first input is either a Frame (its first vec is used) or the vec itself
    final Key firstInput = inputs[0];
    final Object stored = DKV.getGet(firstInput);
    final ByteVec vec;
    if (stored instanceof Frame) {
      vec = (ByteVec) ((Frame) stored).vec(0);
    } else {
      vec = (ByteVec) stored;
    }

    // with forced column types, take the Parquet column types from the file footer
    if (setup.getForceColTypes() && vec != null) {
      setup.setParquetColumnTypes(
              extractColumnTypes(
                      VecParquetReader.readFooter(
                              VecParquetReader.readFooterAsBytes(vec),
                              ParquetMetadataConverter.NO_FILTER)));
    }

    // override incorrect type mappings (using the MessageFormat of the first file)
    final byte[] requestedTypes = setup.getColumnTypes();
    final byte[] types = ParquetParser.correctTypeConversions(vec, requestedTypes);
    setup.setColumnTypes(types);

    // report every column whose requested type had to be replaced
    for (int col = 0; col < types.length; col++) {
      if (types[col] == requestedTypes[col]) {
        continue;
      }
      setup.addErrs(new ParseWriter.UnsupportedTypeOverride(
              firstInput.toString(),
              Vec.TYPE_STR[types[col]],
              Vec.TYPE_STR[requestedTypes[col]],
              setup.getColumnNames()[col]));
    }
    return setup;
  }

  /**
   * Stores the footer bytes of the given vec in the setup's {@code parquetMetadata} field.
   *
   * @param v     vec whose footer is read
   * @param setup setup to update; cast to {@code ParquetParser.ParquetParseSetup}
   * @return the same {@code setup} instance
   */
  @Override
  public ParseSetup setupLocal(Vec v, ParseSetup setup) {
    final ParquetParser.ParquetParseSetup parquetSetup = (ParquetParser.ParquetParseSetup) setup;
    parquetSetup.parquetMetadata = VecParquetReader.readFooterAsBytes(v);
    return setup;
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy