All Downloads are FREE. Search and download functionalities are using the official Maven repository.

smile.io.Avro Maven / Gradle / Ivy

There is a newer version: 4.2.0
Show newest version
/*
 * Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Smile.  If not, see <https://www.gnu.org/licenses/>.
 */

package smile.io;

import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.util.Utf8;
import smile.data.DataFrame;
import smile.data.Tuple;
import smile.data.measure.Measure;
import smile.data.measure.NominalScale;
import smile.data.type.DataType;
import smile.data.type.DataTypes;
import smile.data.type.StructField;
import smile.data.type.StructType;

/**
 * Apache Avro is a data serialization system.
 * 

* Avro provides rich data structures, a compact, fast, binary data format, * a container file, to store persistent data, and remote procedure call (RPC). *

* Avro relies on schemas. When Avro data is stored in a file, its schema * is stored with it. Avro schemas are defined with JSON. * * @author Haifeng Li */ public class Avro { /** * Avro schema. */ private final Schema schema; /** * Constructor. * * @param schema the data schema. */ public Avro(Schema schema) { if (schema.getType() != Schema.Type.RECORD) { throw new IllegalArgumentException("The type of schema is not Record"); } this.schema = schema; } /** * Constructor. * * @param schema the input stream of schema. * @throws IOException when fails to read the file. */ public Avro(InputStream schema) throws IOException { this(new Schema.Parser().parse(schema)); } /** * Constructor. * * @param schema the path to Avro schema file. * @throws IOException when fails to read the file. */ public Avro(Path schema) throws IOException { this(Files.newInputStream(schema)); } /** * Reads an avro file. * * @param path the input file path. * @throws IOException when fails to read the file. * @return the data frame. */ public DataFrame read(Path path) throws IOException { return read(Files.newInputStream(path), Integer.MAX_VALUE); } /** * Reads an avro file. * * @param path the input file path. * @throws IOException when fails to read the file. * @throws URISyntaxException when the file path syntax is wrong. * @return the data frame. */ public DataFrame read(String path) throws IOException, URISyntaxException { return read(HadoopInput.stream(path), Integer.MAX_VALUE); } /** * Reads a limited number of records from an avro file. * * @param input the input stream of data file. * @param limit the number of records to read. * @throws IOException when fails to read the file. * @return the data frame. 
*/ public DataFrame read(InputStream input, int limit) throws IOException { DatumReader datumReader = new GenericDatumReader<>(schema); try (DataFileStream dataFileReader = new DataFileStream<>(input, datumReader)) { StructType struct = toSmileSchema(schema); List rows = new ArrayList<>(); GenericRecord record = null; while (dataFileReader.hasNext() && rows.size() < limit) { // Reuse the record to save memory record = dataFileReader.next(record); Object[] row = new Object[struct.length()]; for (int i = 0; i < row.length; i++) { row[i] = record.get(struct.field(i).name); if (row[i] instanceof Utf8) { String str = row[i].toString(); Measure measure = struct.field(i).measure; row[i] = measure != null ? measure.valueOf(str) : str; } } rows.add(Tuple.of(row, struct)); } return DataFrame.of(rows); } } /** Converts an arrow schema to smile schema. */ private StructType toSmileSchema(Schema schema) { List fields = new ArrayList<>(); for (Schema.Field field : schema.getFields()) { NominalScale scale = null; if (field.schema().getType() == Schema.Type.ENUM) { scale = new NominalScale(field.schema().getEnumSymbols()); } fields.add(new StructField(field.name(), typeOf(field.schema()), scale)); } return DataTypes.struct(fields); } /** Converts an avro type to smile type. */ private DataType typeOf(Schema schema) { return switch (schema.getType()) { case BOOLEAN -> DataTypes.BooleanType; case INT -> DataTypes.IntegerType; case LONG -> DataTypes.LongType; case FLOAT -> DataTypes.FloatType; case DOUBLE -> DataTypes.DoubleType; case STRING -> DataTypes.StringType; case FIXED, BYTES -> DataTypes.ByteArrayType; case ENUM -> new NominalScale(schema.getEnumSymbols()).type(); case ARRAY -> DataTypes.array(typeOf(schema.getElementType())); case MAP -> DataTypes.object(java.util.Map.class); case UNION -> unionType(schema); default -> throw new UnsupportedOperationException("Unsupported Avro type: " + schema); }; } /** Converts a Union type. 
*/ private DataType unionType(Schema schema) { List union = schema.getTypes(); if (union.isEmpty()) { throw new IllegalArgumentException("Empty type list of Union"); } if (union.size() > 2) { String s = union.stream().map(Schema::getType).map(Object::toString).collect(Collectors.joining(", ")); throw new UnsupportedOperationException(String.format("Unsupported type Union(%s)", s)); } if (union.size() == 1) { return typeOf(union.getFirst()); } Schema a = union.get(0); Schema b = union.get(1); if (a.getType() == Schema.Type.NULL && b.getType() != Schema.Type.NULL) { return typeOf(b).boxed(); } if (a.getType() != Schema.Type.NULL && b.getType() == Schema.Type.NULL) { return typeOf(a).boxed(); } return DataTypes.object(Object.class); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy