org.apache.druid.data.input.IntermediateRowParsingReader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of druid-processing Show documentation
Show all versions of druid-processing Show documentation
A module that is everything required to understand Druid Segments
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.data.input;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.UOE;
import org.apache.druid.java.util.common.parsers.CloseableIterator;
import org.apache.druid.java.util.common.parsers.CloseableIteratorWithMetadata;
import org.apache.druid.java.util.common.parsers.ParseException;
import org.apache.druid.utils.CollectionUtils;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
/**
* {@link InputEntityReader} that parses bytes into some intermediate rows first, and then into {@link InputRow}s.
* For example, {@link org.apache.druid.data.input.impl.DelimitedValueReader} parses bytes into string lines, and then parses
* those lines into InputRows.
*
* @param <T> type of intermediate row. For example, it can be {@link String} for text formats.
*/
public abstract class IntermediateRowParsingReader implements InputEntityReader
{
/**
 * Returns an iterator of {@link InputRow}s built in two steps: intermediate rows are pulled from
 * {@link #intermediateRowIteratorWithMetadata()}, and each one is expanded via {@code parseInputRows}.
 *
 * Parse failures ({@link IOException} or {@link ParseException}) are not thrown from {@code hasNext()}. They are
 * wrapped in a new {@link ParseException} whose message is produced by {@code buildParseExceptionMessage} from the
 * source, the current record number and the iterator's current metadata, and handed to an
 * {@code ExceptionThrowingIterator} that becomes the current batch (presumably throwing when {@code next()} is
 * called on it).
 *
 * Closing the returned iterator closes the underlying intermediate row iterator.
 *
 * @return iterator over the parsed input rows
 * @throws IOException if the intermediate row iterator cannot be created
 */
@Override
public CloseableIterator read() throws IOException
{
  final CloseableIteratorWithMetadata intermediateRowIteratorWithMetadata = intermediateRowIteratorWithMetadata();

  return new CloseableIterator()
  {
    // since parseInputRows() returns a list, the below line always iterates over the list,
    // which means it calls Iterator.hasNext() and Iterator.next() at least once per row.
    // This could be unnecessary if the row wouldn't be exploded into multiple inputRows.
    // If this line turned out to be a performance bottleneck, perhaps parseInputRows() interface might not be a
    // good idea. Subclasses could implement read() with some duplicate codes to avoid unnecessary iteration on
    // a singleton list.
    Iterator rows = null;

    // Starts at 1 and is advanced only after a successful parseInputRows() call, so a row that follows a failed
    // row is reported with the same number as the failed one.
    // NOTE(review): confirm whether counting only successfully parsed records is intended.
    long currentRecordNumber = 1;

    @Override
    public boolean hasNext()
    {
      // Keep pulling intermediate rows until the current batch actually has an element. A single pull is not
      // enough: if parseInputRows() returned an empty list for the last intermediate row, answering "true" here
      // would make the following next() fail with NoSuchElementException.
      // NOTE(review): assumes ExceptionThrowingIterator.hasNext() stays true until it has thrown - confirm.
      while (rows == null || !rows.hasNext()) {
        if (!intermediateRowIteratorWithMetadata.hasNext()) {
          return false;
        }
        final T row = intermediateRowIteratorWithMetadata.next();
        try {
          rows = parseInputRows(row).iterator();
          ++currentRecordNumber;
        }
        catch (IOException e) {
          final Map metadata = intermediateRowIteratorWithMetadata.currentMetadata();
          // Defer the failure: keep the IOException as the cause and attach source/record/metadata details.
          rows = new ExceptionThrowingIterator(new ParseException(
              String.valueOf(row),
              e,
              buildParseExceptionMessage(
                  StringUtils.format("Unable to parse row [%s]", row),
                  source(),
                  currentRecordNumber,
                  metadata
              )
          ));
        }
        catch (ParseException e) {
          final Map metadata = intermediateRowIteratorWithMetadata.currentMetadata();
          // Replace the message of the ParseException e
          rows = new ExceptionThrowingIterator(
              new ParseException(
                  e.getInput(),
                  e.isFromPartiallyValidRow(),
                  buildParseExceptionMessage(e.getMessage(), source(), currentRecordNumber, metadata)
              ));
        }
      }
      return true;
    }

    @Override
    public InputRow next()
    {
      if (!hasNext()) {
        throw new NoSuchElementException();
      }
      return rows.next();
    }

    @Override
    public void close() throws IOException
    {
      intermediateRowIteratorWithMetadata.close();
    }
  };
}
/**
 * Returns an iterator that samples the entity one intermediate row at a time. Each call to {@code next()} takes
 * the next intermediate row together with the iterator's current metadata and passes both to
 * {@code sampleIntermediateRow}.
 *
 * Closing the returned iterator closes the underlying intermediate row iterator.
 *
 * @return iterator over the sampled rows and their raw values
 * @throws IOException if the intermediate row iterator cannot be created
 */
@Override
public CloseableIterator sample() throws IOException
{
  final CloseableIteratorWithMetadata intermediateRows = intermediateRowIteratorWithMetadata();

  return new CloseableIterator()
  {
    @Override
    public boolean hasNext()
    {
      return intermediateRows.hasNext();
    }

    @Override
    public InputRowListPlusRawValues next()
    {
      if (hasNext()) {
        // Arguments are evaluated left to right, so the metadata is read after the iterator has advanced.
        return sampleIntermediateRow(intermediateRows.next(), intermediateRows.currentMetadata());
      }
      throw new NoSuchElementException();
    }

    @Override
    public void close() throws IOException
    {
      intermediateRows.close();
    }
  };
}
/**
* Parses and samples the intermediate row and returns input row and the raw values in it. Metadata supplied can
* contain information about the source which will get surfaced in case an exception occurs while parsing the
* intermediate row
*
* @param row intermediate row
* @param metadata additional information about the source and the record getting parsed
* @return sampled data from the intermediate row
*/
private InputRowListPlusRawValues sampleIntermediateRow(T row, Map metadata)
{
final List