org.apache.druid.data.input.TextReader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of druid-processing Show documentation
Show all versions of druid-processing Show documentation
A module that contains everything required to understand Druid segments
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.data.input;
import com.google.common.base.Strings;
import org.apache.druid.data.input.impl.FastLineIterator;
import org.apache.druid.java.util.common.parsers.CloseableIterator;
import org.apache.druid.java.util.common.parsers.CloseableIteratorWithMetadata;
import org.apache.druid.java.util.common.parsers.ParseException;
import org.apache.druid.java.util.common.parsers.ParserUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
/**
 * Abstract {@link InputEntityReader} for line-oriented text formats such as CSV or JSON.
 *
 * <p>The underlying {@link InputEntity} is read one line at a time. Before any data line is handed to
 * {@link #parseInputRows}, the first {@link #getNumHeaderLinesToSkip()} lines are discarded and, if
 * {@link #needsToProcessHeaderLine()} returns true, the following line is passed to {@link #processHeaderLine}.
 */
public abstract class TextReader extends IntermediateRowParsingReader<String>
{
  private final InputRowSchema inputRowSchema;
  private final InputEntity source;

  /**
   * @param inputRowSchema schema exposed to subclasses through {@link #getInputRowSchema()}
   * @param source         entity whose content is read line by line
   */
  public TextReader(InputRowSchema inputRowSchema, InputEntity source)
  {
    this.inputRowSchema = inputRowSchema;
    this.source = source;
  }

  /**
   * Returns the schema this reader was constructed with.
   */
  public InputRowSchema getInputRowSchema()
  {
    return inputRowSchema;
  }

  /**
   * Opens the source and returns an iterator over its data lines, i.e. the lines that remain after the header
   * lines have been skipped and the optional header line has been processed.
   *
   * <p>The metadata of the returned iterator is a single-entry map from {@code "Line"} to the current line number.
   *
   * @throws IOException if the source cannot be opened or {@link #processHeaderLine} fails; in the latter case the
   *                     already opened line iterator is closed before the exception propagates
   */
  @Override
  public CloseableIteratorWithMetadata<String> intermediateRowIteratorWithMetadata() throws IOException
  {
    // Resolved before the source is opened so that a failure here cannot leave an open stream behind.
    final int numHeaderLines = getNumHeaderLinesToSkip();
    final CloseableIterator<String> delegate = new FastLineIterator(source.open());

    try {
      for (int i = 0; i < numHeaderLines && delegate.hasNext(); i++) {
        delegate.next(); // skipped lines are discarded without being inspected
      }
      if (needsToProcessHeaderLine() && delegate.hasNext()) {
        processHeaderLine(delegate.next());
      }
    }
    catch (IOException | RuntimeException e) {
      // The caller never receives the iterator on this path, so it must be closed here.
      try {
        delegate.close();
      }
      catch (IOException | RuntimeException closeFailure) {
        e.addSuppressed(closeFailure);
      }
      throw e;
    }

    return new CloseableIteratorWithMetadata<String>()
    {
      private static final String LINE_KEY = "Line";

      // Starts at the number of header lines (skipped plus processed) and is incremented on every next() call,
      // so after next() it holds the 1-based line number of the line that was just returned.
      private long currentLineNumber = numHeaderLines + (needsToProcessHeaderLine() ? 1 : 0);

      @Override
      public Map<String, Object> currentMetadata()
      {
        return Collections.singletonMap(LINE_KEY, currentLineNumber);
      }

      @Override
      public boolean hasNext()
      {
        return delegate.hasNext();
      }

      @Override
      public String next()
      {
        currentLineNumber++;
        return delegate.next();
      }

      @Override
      public void close() throws IOException
      {
        delegate.close();
      }
    };
  }

  @Override
  protected InputEntity source()
  {
    return source;
  }

  /**
   * Parses the given line into a list of {@link InputRow}s. Note that some file formats can explode a single line of
   * input into multiple inputRows.
   *
   * This method will be called after {@link #getNumHeaderLinesToSkip()} and {@link #processHeaderLine}.
   */
  @Override
  public abstract List<InputRow> parseInputRows(String intermediateRow) throws IOException, ParseException;

  /**
   * Returns the number of leading lines to skip.
   * Skipped lines are discarded as-is; they are not passed to {@link #processHeaderLine}.
   */
  public abstract int getNumHeaderLinesToSkip();

  /**
   * Returns true if the file format needs to process a header line.
   * This method will be called after skipping as many lines as {@link #getNumHeaderLinesToSkip()} returns.
   */
  public abstract boolean needsToProcessHeaderLine();

  /**
   * Processes a header line. This will be called at most once, with the first line after the skipped ones,
   * if {@link #needsToProcessHeaderLine()} returns true and such a line exists.
   */
  public abstract void processHeaderLine(String line) throws IOException;

  /**
   * Derives column names from the fields of a parsed header line.
   *
   * <p>Each null or empty field is replaced by {@link ParserUtils#getDefaultColumnName} for its position; all other
   * fields are used verbatim. A non-empty result is checked with {@link ParserUtils#validateFields} before it is
   * returned. If the parsed line has no fields at all, the result of {@link ParserUtils#generateFieldNames} for
   * that size is returned instead.
   *
   * @param parsedLine fields of the header line, possibly containing null or empty entries
   * @return one column name per field of {@code parsedLine}
   */
  public static List<String> findOrCreateColumnNames(List<String> parsedLine)
  {
    final List<String> columns = new ArrayList<>(parsedLine.size());
    for (int i = 0; i < parsedLine.size(); i++) {
      final String field = parsedLine.get(i);
      if (Strings.isNullOrEmpty(field)) {
        columns.add(ParserUtils.getDefaultColumnName(i));
      } else {
        columns.add(field);
      }
    }
    if (columns.isEmpty()) {
      return ParserUtils.generateFieldNames(parsedLine.size());
    } else {
      ParserUtils.validateFields(columns);
      return columns;
    }
  }
}