/**
 * Copyright © 2016 Jeremy Custenborder (jcustenborder@gmail.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.github.jcustenborder.kafka.connect.spooldir;

import com.google.common.base.Joiner;
import com.opencsv.CSVParser;
import com.opencsv.CSVReader;
import com.opencsv.CSVReaderBuilder;
import org.apache.kafka.connect.data.Field;
import org.apache.kafka.connect.data.Struct;
import org.apache.kafka.connect.errors.DataException;
import org.apache.kafka.connect.source.SourceRecord;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

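/**
 * {@link SpoolDirSourceTask} implementation that reads CSV files with opencsv.
 * Field names are taken either from the file's header row or from the configured
 * value schema; each row is converted to key and value {@link Struct}s and emitted
 * as {@link SourceRecord}s by the base class.
 */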
public class SpoolDirCsvSourceTask extends SpoolDirSourceTask<SpoolDirCsvSourceConnectorConfig> {
  String[] fieldNames;
  private CSVParser csvParser;
  private CSVReader csvReader;
  private InputStreamReader streamReader;
  private Map<String, String> fileMetadata;


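  /**
   * Builds the CSV-specific configuration from the task settings.
   */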
  @Override
  protected SpoolDirCsvSourceConnectorConfig config(Map<String, ?> settings) {
    return new SpoolDirCsvSourceConnectorConfig(true, settings);
  }

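  /**
   * Prepares the task for a new input file: builds the opencsv parser and reader,
   * resolves the field names (header row or schema order), and skips any lines that
   * were already delivered when a previous offset exists.
   */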
  @Override
  protected void configure(InputStream inputStream, Map<String, String> metadata, final Long lastOffset) throws IOException {
    log.trace("configure() - creating csvParser");
    this.csvParser = this.config.createCSVParserBuilder().build();
    this.streamReader = new InputStreamReader(inputStream, this.config.charset);
    CSVReaderBuilder csvReaderBuilder = this.config.createCSVReaderBuilder(this.streamReader, csvParser);
    this.csvReader = csvReaderBuilder.build();

    String[] fieldNames;

    if (this.config.firstRowAsHeader) {
      log.trace("configure() - Reading the header row.");
      fieldNames = this.csvReader.readNext();
      log.info("configure() - field names from header row. fields = {}", Joiner.on(", ").join(fieldNames));
    } else {
      log.trace("configure() - Using fields from schema {}", this.config.valueSchema.name());
      fieldNames = new String[this.config.valueSchema.fields().size()];
      int index = 0;
      for (Field field : this.config.valueSchema.fields()) {
        fieldNames[index++] = field.name();
      }
      log.info("configure() - field names from schema order. fields = {}", Joiner.on(", ").join(fieldNames));
    }

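    // Resume support: skip rows until the reader has consumed lastOffset lines.
    // Comparing against getLinesRead() keeps the skip consistent with recordOffset(),
    // which reports lines read rather than records returned.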
    if (null != lastOffset) {
      log.info("Found previous offset. Skipping {} line(s).", lastOffset.intValue());
      String[] row = null;
      while (null != (row = this.csvReader.readNext()) && this.csvReader.getLinesRead() < lastOffset) {
        log.trace("skipped row");
      }
    }

    this.fieldNames = fieldNames;
    this.fileMetadata = metadata;
  }

  @Override
  public void start(Map<String, String> settings) {
    super.start(settings);
  }

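  /**
   * The source offset for this file is the number of lines the CSV reader has
   * consumed so far.
   */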
  @Override
  public long recordOffset() {
    return this.csvReader.getLinesRead();
  }

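  /**
   * Reads up to {@code batchSize} rows from the current file. Each row is parsed
   * field by field against the value schema; fields that also appear in the key
   * schema are copied into the record key.
   */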
  @Override
  public List<SourceRecord> process() throws IOException {
    List<SourceRecord> records = new ArrayList<>(this.config.batchSize);

    while (records.size() < this.config.batchSize) {
      String[] row = this.csvReader.readNext();

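      // readNext() returns null once the end of the file has been reached.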
      if (row == null) {
        break;
      }
      log.trace("process() - Row on line {} has {} field(s)", recordOffset(), row.length);

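      // Build one key/value Struct pair per row against the configured schemas.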
      Struct keyStruct = new Struct(this.config.keySchema);
      Struct valueStruct = new Struct(this.config.valueSchema);

      for (int i = 0; i < this.fieldNames.length; i++) {
        String fieldName = this.fieldNames[i];
        log.trace("process() - Processing field {}", fieldName);
        String input = row[i];
        log.trace("process() - input = '{}'", input);
        Object fieldValue = null;

        try {
          Field field = this.config.valueSchema.field(fieldName);
          if (null != field) {
            fieldValue = this.parser.parseString(field.schema(), input);
            log.trace("process() - output = '{}'", fieldValue);
            valueStruct.put(field, fieldValue);
          } else {
            log.trace("process() - Field {} is not defined in the schema.", fieldName);
          }
        } catch (Exception ex) {
          String message = String.format("Exception thrown while parsing data for '%s'. linenumber=%s", fieldName, this.recordOffset());
          throw new DataException(message, ex);
        }

        Field keyField = this.config.keySchema.field(fieldName);
        if (null != keyField) {
          log.trace("process() - Setting key field '{}' to '{}'", keyField.name(), fieldValue);
          keyStruct.put(keyField, fieldValue);
        }
      }

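      // Throttle progress logging to roughly once every 20 batches worth of lines.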
      if (log.isInfoEnabled() && this.csvReader.getLinesRead() % ((long) this.config.batchSize * 20) == 0) {
        log.info("Processed {} lines of {}", this.csvReader.getLinesRead(), this.fileMetadata);
      }

      addRecord(records, keyStruct, valueStruct);

    }
    return records;
  }
}
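
/*
 * For context, a minimal configuration for the connector that runs this task. This is
 * an illustrative sketch: the key names match this connector's documented settings,
 * while the topic, paths, and file pattern are placeholders.
 *
 *   connector.class=com.github.jcustenborder.kafka.connect.spooldir.SpoolDirCsvSourceConnector
 *   topic=csv-spooldir
 *   input.path=/var/spooldir/input
 *   finished.path=/var/spooldir/finished
 *   error.path=/var/spooldir/error
 *   input.file.pattern=^.*\.csv$
 *   csv.first.row.as.header=true
 *
 * With csv.first.row.as.header=true the field names come from the header row (the
 * firstRowAsHeader branch in configure() above); with it set to false, a value.schema
 * must be supplied and field names follow the schema's field order.
 */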