All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.opensearch.ingest.common.CsvProcessor Maven / Gradle / Ivy

/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Modifications Copyright OpenSearch Contributors. See
 * GitHub history for details.
 */

package org.opensearch.ingest.common;

import org.opensearch.ingest.AbstractProcessor;
import org.opensearch.ingest.ConfigurationUtils;
import org.opensearch.ingest.IngestDocument;
import org.opensearch.ingest.Processor;

import java.util.List;
import java.util.Map;

import static org.opensearch.ingest.ConfigurationUtils.newConfigurationException;

/**
 * A processor that breaks line from CSV file into separate fields.
 * If there's more fields requested than there is in the CSV, extra field will not be present in the document after processing.
 * In the same way this processor will skip any field that is empty in CSV.
 * 

* By default it uses rules according to RCF 4180 with one exception: whitespaces are * allowed before or after quoted field. Processor can be tweaked with following parameters: *

* quote: set custom quote character (defaults to ") * separator: set custom separator (defaults to ,) * trim: trim leading and trailing whitespaces in unquoted fields * empty_value: sets custom value to use for empty fields (field is skipped if null) */ public final class CsvProcessor extends AbstractProcessor { public static final String TYPE = "csv"; // visible for testing final String field; final String[] headers; final boolean trim; final char quote; final char separator; final boolean ignoreMissing; final Object emptyValue; CsvProcessor( String tag, String description, String field, String[] headers, boolean trim, char separator, char quote, boolean ignoreMissing, Object emptyValue ) { super(tag, description); this.field = field; this.headers = headers; this.trim = trim; this.quote = quote; this.separator = separator; this.ignoreMissing = ignoreMissing; this.emptyValue = emptyValue; } @Override public IngestDocument execute(IngestDocument ingestDocument) { if (headers.length == 0) { return ingestDocument; } String line = ingestDocument.getFieldValue(field, String.class, ignoreMissing); if (line == null && ignoreMissing) { return ingestDocument; } else if (line == null) { throw new IllegalArgumentException("field [" + field + "] is null, cannot process it."); } new CsvParser(ingestDocument, quote, separator, trim, headers, emptyValue).process(line); return ingestDocument; } @Override public String getType() { return TYPE; } public static final class Factory implements org.opensearch.ingest.Processor.Factory { @Override public CsvProcessor create( Map registry, String processorTag, String description, Map config ) { String field = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "field"); String quote = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "quote", "\""); if (quote.length() != 1) { throw newConfigurationException(TYPE, processorTag, "quote", "quote has to be single character like \" or '"); } String separator = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "separator", ","); if (separator.length() != 1) { throw newConfigurationException(TYPE, processorTag, "separator", "separator has to be single character like , or ;"); } boolean trim = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "trim", false); Object emptyValue = null; if (config.containsKey("empty_value")) { emptyValue = ConfigurationUtils.readObject(TYPE, processorTag, config, "empty_value"); } boolean ignoreMissing = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false); List targetFields = ConfigurationUtils.readList(TYPE, processorTag, config, "target_fields"); if (targetFields.isEmpty()) { throw newConfigurationException(TYPE, processorTag, "target_fields", "target fields list can't be empty"); } return new CsvProcessor( processorTag, description, field, targetFields.toArray(new String[0]), trim, separator.charAt(0), quote.charAt(0), ignoreMissing, emptyValue ); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy