All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.senzing.io.RecordReader Maven / Gradle / Ivy

The newest version!
package com.senzing.io;

import com.senzing.util.JsonUtilities;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

import javax.json.*;
import javax.json.stream.JsonParser;
import javax.json.stream.JsonParserFactory;
import javax.json.stream.JsonParsingException;
import java.io.*;
import java.util.*;

/**
 * Provides a reader over records that are formatted as JSON, JSON-Lines
 * or CSV.
 */
public class RecordReader {
  /**
   * Represents the supported format for the records.
   */
  public enum Format {
    JSON("application/json", "JSON"),
    JSON_LINES("application/x-jsonlines", "JSON Lines"),
    CSV("text/csv", "CSV");

    /**
     *  The associated media type.
     */
    private String mediaType;

    /**
     * The simple name associated with the media type.
     */
    private String simpleName;

    /**
     * The lookup map to lookup format by media type.
     */
    private static Map MEDIA_TYPE_LOOKUP;

    /**
     * Constructs with the specified media type.
     * @param mediaType The media type.
     * @param simpleName The simple name for the format.
     */
    Format(String mediaType, String simpleName) {
      this.mediaType = mediaType;
      this.simpleName = simpleName;
    }

    /**
     * Returns the associated media type.
     *
     * @return The associated media type.
     */
    public String getMediaType() {
      return this.mediaType;
    }

    /**
     * Returns the simple name for the format.
     *
     * @return The simple name for the format.
     */
    public String getSimpleName() { return this.simpleName; }

    /**
     * Initializes the lookup.
     */
    static {
      try {
        Map map = new LinkedHashMap<>();
        for (Format format: Format.values()) {
          map.put(format.getMediaType(), format);
        }
        MEDIA_TYPE_LOOKUP = Collections.unmodifiableMap(map);

      } catch (Exception e) {
        e.printStackTrace();
        throw new ExceptionInInitializerError(e);
      }
    }

    /**
     * Returns the {@link Format} for the specified media type or null
     * if no format is associated with the media type.  This method returns
     * null if null is specified as the media type.
     *
     * @param mediaType The media type for which the {@link Format} is being
     *                  requested.
     *
     * @return The associated {@link Format} for the media type, or
     *         null if there is none or if the specified parameter is
     *         null
     */
    public static Format fromMediaType(String mediaType) {
      if (mediaType == null) return null;
      return MEDIA_TYPE_LOOKUP.get(mediaType.trim().toLowerCase());
    }
  }

  /**
   * The format for the records.
   */
  private Format format = null;

  /**
   * The backing character reader.
   */
  private Reader reader;

  /**
   * The mapping for the data sources.
   */
  private Map dataSourceMap;

  /**
   * The source ID to assign to the records.
   */
  private String sourceId;

  /**
   * The backing {@link RecordProvider}.
   */
  private RecordProvider recordProvider;

  /**
   * Constructs a {@link RecordReader} with the specified {@link Reader}.
   * The format of the reader is inferred using the first character read.
   *
   * @param reader The {@link Reader} from which to read the text for the
   *               records.
   *
   * @throws IOException If an I/O failure occurs.
   */
  public RecordReader(Reader reader) throws IOException {
    this(null,
         reader,
         Collections.emptyMap(),
         null);
  }

  /**
   * Constructs a {@link RecordReader} with the specified {@link Format} and
   * {@link Reader}.
   *
   * @param format The expected format of the records.
   *
   * @param reader The {@link Reader} from which to read the text for the
   *               records.
   *
   * @throws IOException If an I/O failure occurs.
   */
  public RecordReader(Format format, Reader reader) throws IOException {
    this(format,
         reader,
         Collections.emptyMap(),
         null);
  }

  /**
   * Constructs a {@link RecordReader} with the specified {@link Reader} and
   * data source code.  The format of the reader is inferred from the first
   * character read.
   *
   * @param reader The {@link Reader} from which to read the text for the
   *               records.
   *
   * @param dataSource The data source to assign to each record.
   *
   * @throws IOException If an I/O failure occurs.
   */
  public RecordReader(Reader reader, String dataSource)
      throws IOException
  {
    this(null,
         reader,
         Collections.singletonMap("", dataSource),
         null);
  }

  /**
   * Constructs a {@link RecordReader} with the specified {@link Format},
   * {@link Reader} and data source code.
   *
   * @param format The expected format of the records.
   *
   * @param reader The {@link Reader} from which to read the text for the
   *               records.
   *
   * @param dataSource The data source to assign to each record.
   *
   * @throws IOException If an I/O failure occurs.
   */
  public RecordReader(Format format, Reader reader, String dataSource)
      throws IOException
  {
    this(format,
         reader,
         Collections.singletonMap("", dataSource),
         null);
  }

  /**
   * Constructs a {@link RecordReader} with the specified {@link Reader},
   * data source code and source ID.  The format of the reader is inferred from
   * the first character.
   *
   * @param reader The {@link Reader} from which to read the text for the
   *               records.
   *
   * @param dataSource The data source to assign to each record.
   *
   * @param sourceId the source ID to assign to each record.
   *
   * @throws IOException If an I/O failure occurs.
   */
  public RecordReader(Reader reader, String dataSource, String sourceId)
      throws IOException
  {
    this(null,
         reader,
         Collections.singletonMap("", dataSource),
         sourceId);
  }

  /**
   * Constructs a {@link RecordReader} with the specified {@link Format},
   * {@link Reader}, data source and source ID.
   *
   * @param format The expected format of the records.
   *
   * @param reader The {@link Reader} from which to read the text for the
   *               records.
   *
   * @param dataSource The data source to assign to each record.
   *
   * @param sourceId the source ID to assign to each record.
   *
   * @throws IOException If an I/O failure occurs.
   */
  public RecordReader(Format  format,
                      Reader  reader,
                      String  dataSource,
                      String  sourceId)
      throws IOException
  {
    this(format,
         reader,
         Collections.singletonMap("", dataSource),
         sourceId);
  }

  /**
   * Constructs a {@link RecordReader} with the specified {@link Reader}
   * and data source code map.  The format of the reader is inferred from
   * the first character.
   *
   * @param reader The {@link Reader} from which to read the text for the
   *               records.
   *
   * @param dataSourceMap The map of original data source codes to replacement
   *                      data source codes.  The mapping from empty-string will
   *                      be used for any record that has no data source.  The
   *                      mapping from null will be for any data
   *                      source (including no data source) that has no key in
   *                      the map.
   *
   * @throws IOException If an I/O failure occurs.
   */
  public RecordReader(Reader reader, Map dataSourceMap)
      throws IOException
  {
    this(null, reader, dataSourceMap, null);
  }

  /**
   * Constructs a {@link RecordReader} with the specified {@link Format},
   * {@link Reader} and data source code map.
   *
   * @param format The expected format of the records.
   *
   * @param reader The {@link Reader} from which to read the text for the
   *               records.
   *
   * @param dataSourceMap The map of original data source names to replacement
   *                      data source name.  The mapping from empty-string will
   *                      be used for any record that has no data source or
   *                      whose data source is not in the map.
   *
   * @throws IOException If an I/O failure occurs.
   */
  public RecordReader(Format              format,
                      Reader              reader,
                      Map dataSourceMap)
      throws IOException
  {
    this(format, reader, dataSourceMap,null);
  }

  /**
   * Constructs a {@link RecordReader} with the specified {@link Reader},
   * data source code map and source ID.  The format of the reader is inferred
   * using the first character read.
   *
   * @param reader The {@link Reader} from which to read the text for the
   *               records.
   *
   * @param dataSourceMap The map of original data source names to replacement
   *                      data source name.  The mapping from empty-string will
   *                      be used for any record that has no data source or
   *                      whose data source is not in the map.
   *
   * @param sourceId the source ID to assign to each record.
   *
   * @throws IOException If an I/O failure occurs.
   */
  public RecordReader(Reader              reader,
                      Map dataSourceMap,
                      String              sourceId)
      throws IOException
  {
    this(null, reader, dataSourceMap, sourceId);
  }

  /**
   * Constructs a {@link RecordReader} with the specified {@link Format},
   * {@link Reader}, data source map, and source ID.  The format is explicitly
   * specified by the first parameter.
   *
   * @param format The expected format of the records.
   *
   * @param reader The {@link Reader} from which to read the text for the
   *               records.
   *
   * @param dataSourceMap The map of original data source names to replacement
   *                      data source name.  The mapping from empty-string will
   *                      be used for any record that has no data source or
   *                      whose data source is not in the map.
   *
   * @param sourceId the source ID to assign to each record.
   *
   * @throws IOException If an I/O failure occurs.
   */
  public RecordReader(Format              format,
                      Reader              reader,
                      Map dataSourceMap,
                      String              sourceId)
      throws IOException
  {
    // set the format
    this.format = format;

    // if the format is unknown then try to infer it
    if (this.format == null) {
      // use a push-back reader so we can read the first non-whitespace
      // character and then push it back to read again later
      PushbackReader pushbackReader = new PushbackReader(reader);
      this.reader = pushbackReader;

      // read characters until the format is set or we hit EOF
      while (this.format == null) {
        // read the next character
        int nextChar = pushbackReader.read();

        // check for EOF
        if (nextChar < 0) {
          break;
        }

        // if whitespace then skip it
        if (Character.isWhitespace((char) nextChar)) continue;

        // if not whitespace then unread the character
        pushbackReader.unread(nextChar);

        // switch on the character to determine the format
        switch ((char) nextChar) {
          case '[':
            this.format = Format.JSON;
            break;
          case '{':
            this.format = Format.JSON_LINES;
            break;
          default:
            this.format = Format.CSV;
        }
      }

    } else {
      // just set the reader
      this.reader = reader;
    }

    // default to JSON format if EOF detected
    if (this.format != null) {
      switch (this.format) {
        case JSON:
          this.recordProvider = new JsonArrayRecordProvider(this.reader);
          break;
        case JSON_LINES:
          this.recordProvider = new JsonLinesRecordProvider(this.reader);
          break;
        case CSV:
          this.recordProvider = new CsvRecordProvider(this.reader);
          break;
        default:
          throw new IllegalStateException(
              "Unrecognized RecordReader.Format; " + this.format);
      }
    } else {
      // set the format to JSON-Lines
      this.format = Format.JSON_LINES;

      // use a JSON-lines record provider if the format is null
      this.recordProvider = new JsonLinesRecordProvider(this.reader);
    }

    // initialize the data source map with upper-case keys
    this.dataSourceMap = (dataSourceMap == null) ? Collections.emptyMap()
        : new LinkedHashMap<>();
    try {
      if (dataSourceMap != null) {
        dataSourceMap.entrySet().forEach(entry -> {
          String key = entry.getKey();
          if (key != null) key = key.trim().toUpperCase();
          String value = entry.getValue().trim().toUpperCase();
          this.dataSourceMap.put(key, value);
        });
        this.dataSourceMap = Collections.unmodifiableMap(this.dataSourceMap);
      }

    } catch (NullPointerException e) {
      System.err.println("DATA SOURCE MAP: " + dataSourceMap);
      throw e;
    }
    this.sourceId = sourceId;
    if (this.sourceId != null) {
      this.sourceId = this.sourceId.trim();
      if (this.sourceId.length() == 0) {
        this.sourceId = null;
      }
    }
  }

  /**
   * Returns the {@link Format} of the records.
   *
   * @return The {@link Format} of the records.
   */
  public Format getFormat() {
    return this.format;
  }

  /**
   * Reads the next record and returns null if there are no more
   * records.
   *
   * @return The next record and returns null if there are no more
   *         records.
   */
  public JsonObject readRecord() {
    return this.recordProvider.getNextRecord();
  }

  /**
   * Gets the line number of an error after calling {@link #readRecord()}.
   * This returns null if there was no error after calling {@link
   * #readRecord()} and will return null if {@link #readRecord()}
   * has never been called.
   *
   * @return The line number associated with the error on the last attempt to
   *         get a record, or null if there was no error.
   */
  public Long getErrorLineNumber() {
    return this.recordProvider.getErrorLineNumber();
  }

  /**
   * A interface for providing records.
   */
  private interface RecordProvider {
    /**
     * Gets the next record as a {@link JsonObject}.
     * @return The next {@link JsonObject} record.
     */
    JsonObject getNextRecord();

    /**
     * Gets the line number of an error after calling {@link #getNextRecord()}.
     * This returns null if there was no error after calling {@link
     * #getNextRecord()} and will return null if {@link
     * #getNextRecord()} has never been called.
     *
     * @return The line number associated with the error on the last attempt to
     *         get a record, or null if there was no error.
     */
    Long getErrorLineNumber();
  }

  /**
   * Augments the specified record with "DATA_SOURCE",
   * "ENTITY_TYPE" and "SOURCE_ID" as appropriate.
   *
   * @param record The {@link JsonObject} record to be updated.
   * @return The updated {@link JsonObject} record.
   */
  private JsonObject augmentRecord(JsonObject record)
  {
    if (record == null) return null;
    JsonObjectBuilder job = Json.createObjectBuilder(record);
    String dsrc = JsonUtilities.getString(record, "DATA_SOURCE", "");
    dsrc = dsrc.trim().toUpperCase();

    // get the mapped data source
    String dataSource = this.dataSourceMap.get(dsrc);
    if (dataSource == null) dataSource = this.dataSourceMap.get(null);
    if (dataSource != null && dataSource.trim().length() == 0) {
      dataSource = null;
    }

    // remap the data source
    if (dataSource != null) {
      job.remove("DATA_SOURCE");
      job.add("DATA_SOURCE", dataSource);
      dsrc = dataSource;
    }

    // set the source ID
    if (this.sourceId != null) {
      job.remove("SOURCE_ID");
      job.add("SOURCE_ID", this.sourceId);
    }

    // build the object
    return job.build();
  }

  /**
   * A {@link RecordProvider} implementation for records when reading
   * a JSON array.
   */
  private class JsonArrayRecordProvider implements RecordProvider
  {
    /**
     * Iterator over {@link JsonObject} records.
     */
    private Iterator recordIter;

    /**
     * Indicates whether or not the JSON properly parses to avoid
     */
    private boolean errant = false;

    /**
     * The line number for the last error.
     */
    private Long errorLineNumber = null;

    /**
     * Constructor.
     */
    public JsonArrayRecordProvider(Reader reader) {
      JsonParserFactory jpf = Json.createParserFactory(Collections.emptyMap());
      JsonParser jp = jpf.createParser(reader);
      jp.next();
      this.recordIter = jp.getArrayStream()
          .map(jv -> (JsonObject) jv).iterator();
    }

    /**
     * Gets the next record from the JSON array.
     * @return The next {@link JsonObject} from the array.
     */
    public JsonObject getNextRecord() {
      RecordReader owner = RecordReader.this;
      JsonObject result = null;
      while (result == null) {
        try {
          if (!recordIter.hasNext()) break;
          result = owner.augmentRecord(this.recordIter.next());
          this.errant = false; // clear the errant flag

        } catch (Exception e) {
          if (this.errant) continue;
          if (e instanceof JsonParsingException) {
            JsonParsingException jpe = (JsonParsingException) e;
            System.out.println("LOCATION: " + jpe.getLocation());
            System.out.println("LINE NUMBER: " + jpe.getLocation().getLineNumber());
            this.errorLineNumber = jpe.getLocation().getLineNumber();
          }
          this.errant = true;
          throw e;
        }
      }
      return result;
    }

    @Override
    public Long getErrorLineNumber() {
      return this.errorLineNumber;
    }
  }

  /**
   * A {@link RecordProvider} implementation for records when reading
   * a files in a "JSON lines" format.
   */
  private class JsonLinesRecordProvider implements RecordProvider {
    /**
     * The backing {@link BufferedReader} for reading the lines from the file.
     */
    private BufferedReader reader;

    /**
     * The current line number.
     */
    private long lineNumber = 0;

    /**
     * The error line number if an error is found.
     */
    private Long errorLineNumber = null;

    /**
     * Default constructor.
     */
    public JsonLinesRecordProvider(Reader reader) {
      this.reader = new BufferedReader(reader);
    }

    /**
     * Implemented to get the next line from the file and parse it as
     * a {@link JsonObject} record.
     *
     * @return The next {@link JsonObject} record.
     */
    public JsonObject getNextRecord() {
      try {
        RecordReader  owner   = RecordReader.this;
        JsonObject    record  = null;

        while (this.reader != null && record == null) {
          // read the next line and check for EOF
          String line = this.reader.readLine();
          if (line == null) {
            this.reader.close();
            this.reader = null;
            continue;
          }
          this.lineNumber++;
          this.errorLineNumber = null; // clear the error line number

          // trim the line of extra whitespace
          line = line.trim();

          // check for blank lines and skip them
          if (line.length() == 0) continue;

          // check if the line begins with a "#" for a comment lines
          if (line.startsWith("#")) continue;

          // check if the line does NOT start with "{"
          if (!line.startsWith("{")) {
            throw new IllegalStateException(
                "Line does not appear to be JSON record: " + line);
          }

          // parse the line
          try {
            record = JsonUtilities.parseJsonObject(line);

          } catch (JsonParsingException e) {
            this.errorLineNumber = this.lineNumber;
            throw e;
          }
        }

        return owner.augmentRecord(record);

      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    @Override
    public Long getErrorLineNumber() {
      return this.errorLineNumber;
    }
  }

  /**
   * Implements {@link RecordProvider} for a CSV file.
   *
   */
  private class CsvRecordProvider implements RecordProvider {
    /**
     * The CSV parser.
     */
    private CSVParser parser;

    /**
     * The record iterator.
     */
    private Iterator recordIter;

    /**
     * The line number for the last error.
     */
    private Long errorLineNumber = null;

    public CsvRecordProvider(Reader reader) {
      CSVFormat csvFormat = CSVFormat.Builder.create(CSVFormat.DEFAULT)
          .setHeader().setSkipHeaderRecord(true).setIgnoreEmptyLines(true)
          .setTrim(true).setIgnoreSurroundingSpaces(true).build();

      try {
        this.parser = new CSVParser(reader, csvFormat);
        Map headerMap = this.parser.getHeaderMap();
        Set headers = new HashSet<>();
        headerMap.keySet().forEach(h -> {
          headers.add(h.toUpperCase());
        });
        this.recordIter = parser.iterator();

      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    public JsonObject getNextRecord() {
      RecordReader owner = RecordReader.this;
      this.errorLineNumber = null;
      try {
        if (!this.recordIter.hasNext()) return null;
        CSVRecord record = this.recordIter.next();
        Map recordMap = record.toMap();
        Iterator> entryIter
            = recordMap.entrySet().iterator();
        while (entryIter.hasNext()) {
          Map.Entry entry = entryIter.next();
          String value = entry.getValue();
          if (value == null || value.trim().length() == 0) {
            entryIter.remove();
          }
        }

        @SuppressWarnings("unchecked")
        Map map = (Map) recordMap;

        JsonObject jsonObj = Json.createObjectBuilder(map).build();

        JsonObject result = owner.augmentRecord(jsonObj);

        return result;

      } catch (RuntimeException e) {
        this.errorLineNumber = this.parser.getCurrentLineNumber();
        throw e;

      } catch (Exception e) {
        this.errorLineNumber = this.parser.getCurrentLineNumber();
        throw new RuntimeException(e);
      }
    }

    @Override
    public Long getErrorLineNumber() {
      return this.errorLineNumber;
    }
  }

  /**
   * A simple main function to test this class manually.
   *
   * @param args The command-line arguments.
   */
  public static void main(String[] args) {
    try {
      for (String arg: args) {
        File file = new File(arg);
        try (FileInputStream fis = new FileInputStream(file);
             InputStreamReader isr = new InputStreamReader(fis))
        {
          RecordReader recordReader = new RecordReader(isr);
          System.out.println();
          System.out.println("----------------------------------------------");
          System.out.println("FILE    : " + file);
          System.out.println("FORMAT  : " + recordReader.getFormat());
          System.out.println();
          int index = 0;
          for (JsonObject record = recordReader.readRecord();
               record != null;
               record = recordReader.readRecord())
          {
            index++;
            System.out.println(index + ": " + JsonUtilities.toJsonText(record));
            System.out.println();
          }
          System.out.println("COUNT   : " + index);
          System.out.println("----------------------------------------------");
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy