/*
* Copyright 2017-2024 Tim Segall
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cobber.fta.driver;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.text.MessageFormat;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import com.cobber.fta.TextAnalysisResult;
import com.cobber.fta.TextAnalyzer;
import com.cobber.fta.core.CircularBuffer;
import com.cobber.fta.core.FTAMergeException;
import com.cobber.fta.core.FTAPluginException;
import com.cobber.fta.core.FTAUnsupportedLocaleException;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.univocity.parsers.common.TextParsingException;
import com.univocity.parsers.csv.CsvParser;
import com.univocity.parsers.csv.CsvParserSettings;
import com.univocity.parsers.csv.UnescapedQuoteHandling;
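/**
 * Analyze Text data to determine simple type and Semantic type information, as well as
 * other key metrics associated with a text stream. One FileProcessor is constructed per
 * input file by the command-line driver; a minimal sketch of its use (the filename is
 * purely illustrative):
 *
 *   new FileProcessor(System.err, "customers.csv", options).process();
 */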
class FileProcessor {
private final DriverOptions options;
private final PrintStream error;
private final String filename;
private PrintStream output;
FileProcessor(final PrintStream error, final String filename, final DriverOptions cmdLineOptions) {
this.error = error;
this.filename = filename;
this.options = new DriverOptions(cmdLineOptions);
}
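/**
 * Process the input file end-to-end: pick up any sidecar '&lt;filename&gt;.options' file,
 * configure the CSV parser, and run either bulk or per-field analysis.
 */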
protected void process() throws IOException, FTAPluginException, FTAUnsupportedLocaleException, FTAProcessingException, FTAMergeException {
if (Files.exists(Paths.get(filename + ".options"))) {
options.addFromFile(filename + ".options");
}
output = options.output ? new PrintStream(filename + ".out") : System.out;
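// Configure the Univocity CSV parser - detect the format automatically unless the
// delimiter and/or quote character were supplied explicitly on the command line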
final CsvParserSettings settings = new CsvParserSettings();
settings.setHeaderExtractionEnabled(true);
settings.detectFormatAutomatically();
settings.setLineSeparatorDetectionEnabled(true);
settings.setIgnoreLeadingWhitespaces(false);
settings.setIgnoreTrailingWhitespaces(false);
settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER);
// settings.setNullValue("");
settings.setEmptyValue("");
settings.setSkipEmptyLines(false);
if (options.delimiter != null) {
settings.getFormat().setDelimiter(options.delimiter.charAt(0));
settings.setDelimiterDetectionEnabled(false);
}
else
settings.setDelimiterDetectionEnabled(true, ',', '\t', '|', ';');
if (options.quoteChar != null) {
settings.getFormat().setQuote(options.quoteChar.charAt(0));
settings.setQuoteDetectionEnabled(false);
}
if (options.xMaxCharsPerColumn != -1)
settings.setMaxCharsPerColumn(options.xMaxCharsPerColumn);
settings.setMaxColumns(options.xMaxColumns);
try {
if (options.bulk)
processBulk(settings);
else
processAllFields(settings);
}
catch (Exception e) {
if (options.output)
output.close();
throw e;
}
}
class RowCount {
int numFields;
long firstRow;
long count;
RowCount(final int numFields, final long firstRow) {
this.numFields = numFields;
this.firstRow = firstRow;
this.count = 1;
}
RowCount inc() {
this.count++;
return this;
}
}
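/*
 * Bulk input is pre-aggregated and grouped by key (all rows for a given key contiguous),
 * with exactly four columns: key,fieldName,fieldValue,fieldCount. A minimal sketch of
 * such a file - header plus data rows, values purely illustrative:
 *
 *   key,fieldName,fieldValue,fieldCount
 *   1,Gender,MALE,10
 *   1,Gender,FEMALE,12
 *   2,State,CA,100
 *
 * Each key is trained via trainBulk() and its result reported independently.
 */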
private void processBulk(final CsvParserSettings settings) throws IOException, FTAPluginException, FTAUnsupportedLocaleException, FTAProcessingException {
String[] header;
int numFields;
TextAnalyzer analyzer;
TextAnalysisResult result;
String previousKey = null;
String key;
String previousName = null;
String name = null;
final Map<String, Long> bulkMap = new HashMap<>();
try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(filename)), options.charset))) {
final CsvParser parser = new CsvParser(settings);
parser.beginParsing(in);
header = parser.getRecordMetadata().headers();
if (header.length != 4)
throw new FTAProcessingException(filename,
MessageFormat.format("Expected input with four columns (key,fieldName,fieldValue,fieldCount). {0} field(s) in input",
header.length));
numFields = header.length;
long thisRecord = 0;
long totalCount = 0;
String[] row;
final Map<Integer, RowCount> errors = new HashMap<>();
while ((row = parser.parseNext()) != null) {
thisRecord++;
if (row.length != numFields) {
final RowCount existing = errors.get(row.length);
if (existing == null)
errors.put(row.length, new RowCount(row.length, thisRecord));
else
errors.put(row.length, existing.inc());
continue;
}
key = row[0];
name = row[1];
final String fieldValue = row[2];
final Long fieldCount = Long.valueOf(row[3].trim());
// Key change: flush the previous field before accumulating this row's count,
// so each field's total reflects only its own rows
if (previousKey == null || !key.equals(previousKey)) {
if (!bulkMap.isEmpty()) {
analyzer = new TextAnalyzer(previousName);
options.apply(analyzer);
analyzer.trainBulk(bulkMap);
analyzer.setTotalCount(totalCount);
result = analyzer.getResult();
output.printf("Field '%s' - %s%n", sanitize(analyzer.getStreamName()), result.asJSON(options.pretty, options.verbose));
totalCount = 0;
}
bulkMap.clear();
previousKey = key;
previousName = name;
}
totalCount += fieldCount;
bulkMap.put(fieldValue, fieldCount);
}
if (!errors.isEmpty()) {
for (final RowCount recordError : errors.values())
error.printf("ERROR: File: '%s', %d records skipped with %d fields, first occurrence %d, expected %d%n",
filename, recordError.count, recordError.numFields, recordError.firstRow, numFields);
}
if (!bulkMap.isEmpty()) {
analyzer = new TextAnalyzer(name);
options.apply(analyzer);
analyzer.trainBulk(bulkMap);
analyzer.setTotalCount(totalCount);
result = analyzer.getResult();
output.printf("Field '%s' - %s%n", sanitize(analyzer.getStreamName()), result.asJSON(options.pretty, options.verbose));
}
}
}
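/**
 * Percent-encode embedded line breaks in a stream name so it prints on a single line -
 * for example "First\nName" is rendered as "First%0AName".
 */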
private String sanitize(final String input) {
if (input == null || input.isEmpty())
return input;
final StringBuilder b = new StringBuilder();
for (int i = 0; i < input.length(); i++) {
if (input.charAt(i) == '\n')
b.append("%0A");
else if (input.charAt(i) == '\r')
b.append("%0D");
else
b.append(input.charAt(i));
}
return b.toString();
}
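/*
 * Analyze every field (or only the field selected via options.col): stream the records
 * through a Processor, optionally alternating across two Processors to exercise merge
 * support, then report per-field results. Validation is tiered: level 1 cross-checks
 * the internal counts, level 2 additionally re-reads the file and matches every value
 * against the detected Regular Expression.
 */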
private void processAllFields(final CsvParserSettings settings) throws IOException, FTAPluginException, FTAUnsupportedLocaleException, FTAProcessingException, FTAMergeException {
final long startTime = System.currentTimeMillis();
long initializedTime = -1;
long consumedTime = -1;
long resultsTime = -1;
Processor processor = null;
Processor altProcessor = null;
String[] header = null;
int numFields = 0;
long rawRecordIndex = 0;
final Map<Integer, RowCount> errors = new HashMap<>();
try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(filename)), options.charset))) {
// Skip the first lines if requested
if (options.skip != 0) {
for (int i = 0; i < options.skip; i++)
in.readLine();
rawRecordIndex += options.skip;
}
final CsvParser parser = new CsvParser(settings);
parser.beginParsing(in);
header = parser.getRecordMetadata().headers();
if (header == null)
throw new FTAProcessingException(filename, "Cannot parse header");
numFields = header.length;
if (options.col >= numFields)
throw new FTAProcessingException(filename, MessageFormat.format("Column {0} does not exist. Only {1} field(s) in input.", options.col, numFields));
for (int i = 0; i < numFields; i++) {
if ((options.col == -1 || options.col == i) && options.verbose != 0 && options.noAnalysis)
System.out.println(header[i]);
}
processor = new Processor(com.cobber.fta.core.Utils.getBaseName(Paths.get(filename).getFileName().toString()), header, options);
if (options.testmerge)
altProcessor = new Processor(com.cobber.fta.core.Utils.getBaseName(Paths.get(filename).getFileName().toString()), header, options);
initializedTime = System.currentTimeMillis();
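// Lag consumption by options.trailer records so the trailing records of the file
// are never analyzed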
final CircularBuffer buffer = new CircularBuffer(options.trailer + 1);
String[] row;
int processedRecords = 0;
while ((row = parser.parseNext()) != null) {
rawRecordIndex++;
// Skip blank lines
if (row.length == 1 && row[0] == null)
continue;
if (row.length != numFields) {
final RowCount existing = errors.get(row.length);
if (existing == null)
errors.put(row.length, new RowCount(row.length, rawRecordIndex));
else
errors.put(row.length, existing.inc());
continue;
}
buffer.add(row);
if (!buffer.isFull())
continue;
row = buffer.get();
processedRecords++;
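// In testmerge mode alternate records between two analyzers; the two are merged
// below to validate merge support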
if (options.testmerge) {
if (processedRecords % 2 == 0)
processor.consume(row);
else
altProcessor.consume(row);
}
else
processor.consume(row);
if (processedRecords == options.recordsToProcess) {
parser.stopParsing();
break;
}
}
consumedTime = System.currentTimeMillis();
}
catch (FileNotFoundException e) {
throw new FTAProcessingException(filename, "File not found", e);
}
catch (TextParsingException|ArrayIndexOutOfBoundsException e) {
throw new FTAProcessingException(filename, "Univocity exception", e);
}
if (options.testmerge)
processor = Processor.merge(processor, altProcessor);
if (!errors.isEmpty()) {
long toSkip = -1;
for (final RowCount recordError : errors.values()) {
error.printf("ERROR: File: '%s', %d records skipped with %d fields, first occurrence %d, expected %d%n",
filename, recordError.count, recordError.numFields, recordError.firstRow, numFields);
if (rawRecordIndex > 20 && recordError.count > .8 * rawRecordIndex)
toSkip = recordError.firstRow;
}
if (toSkip != -1)
error.printf("ERROR: File: '%s', retry with --skip %d%n", filename, toSkip);
}
if (options.noAnalysis)
System.exit(0);
// Validate the result of the analysis if requested
final int[] matched = new int[numFields];
final int[] nulls = new int[numFields];
final int[] blanks = new int[numFields];
final Set<String> failures = new HashSet<>();
// Check the RegExp at level 2 validation
if (options.validate == 2) {
try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(filename)), options.charset))) {
final CsvParser parser = new CsvParser(settings);
parser.beginParsing(in);
numFields = parser.getRecordMetadata().headers().length;
final TextAnalysisResult[] results = processor.getResult();
final Pattern[] patterns = new Pattern[numFields];
for (int i = 0; i < numFields; i++)
if (options.col == -1 || options.col == i)
patterns[i] = Pattern.compile(results[i].getRegExp());
rawRecordIndex = 0;
String[] row;
while ((row = parser.parseNext()) != null) {
rawRecordIndex++;
if (row.length != numFields)
continue;
for (int i = 0; i < numFields; i++) {
if (options.col == -1 || options.col == i) {
final String value = row[i];
if (value == null)
nulls[i]++;
else if (value.trim().isEmpty())
blanks[i]++;
else if (patterns[i].matcher(value).matches())
matched[i]++;
else if (options.verbose != 0)
failures.add(value);
}
}
if (rawRecordIndex == options.recordsToProcess) {
parser.stopParsing();
break;
}
}
}
}
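// Emit the per-field results (wrapped as a JSON array if requested) and accumulate
// the summary statistics reported below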
int typesDetected = 0;
long matchCount = 0;
long sampleCount = 0;
TextAnalysisResult result = null;
if (options.json)
output.printf("[%n");
final TextAnalysisResult[] results = processor.getResult();
for (int i = 0; i < numFields; i++) {
if (options.col == -1 || options.col == i) {
final TextAnalyzer analyzer = processor.getAnalyzer(i);
if (rawRecordIndex != options.recordsToProcess)
analyzer.setTotalCount(rawRecordIndex);
result = results[i];
if (options.json) {
if (i != 0 && options.col == -1)
output.printf(",");
}
else
output.printf("Field '%s' (%d) - ", sanitize(analyzer.getStreamName()), i);
output.printf("%s%n", result.asJSON(options.pretty, options.verbose));
if (options.pluginDefinition) {
final ObjectNode pluginDefinition = result.asPlugin(analyzer);
if (pluginDefinition != null) {
final ObjectMapper mapper = new ObjectMapper();
final ObjectWriter writer = mapper.writerWithDefaultPrettyPrinter();
try {
output.printf("%s%n", writer.writeValueAsString(pluginDefinition));
} catch (JsonProcessingException e) {
throw new FTAProcessingException(filename, "JsonProcessing exception", e);
}
}
}
if (result.getType() != null)
typesDetected++;
matchCount += result.getMatchCount();
sampleCount += result.getSampleCount();
// Check the counts if we are validating
if (options.validate >= 1) {
final String ret = result.checkCounts();
if (ret != null)
throw new FTAProcessingException(filename,
MessageFormat.format("Composite: {0}, field: {1} ({2}), failed count validation - {3}",
analyzer.getContext().getCompositeName(), analyzer.getContext().getStreamName(),
analyzer.getContext().getStreamIndex(), ret));
}
if (options.validate == 2 && matched[i] != result.getMatchCount()) {
if (result.isSemanticType())
if (matched[i] > result.getMatchCount())
error.printf("\t*** Warning: Match Count via RegExp (%d) > LogicalType match analysis (%d) ***%n", matched[i], result.getMatchCount());
else
error.printf("\t*** Error: Match Count via RegExp (%d) < LogicalType match analysis (%d) ***%n", matched[i], result.getMatchCount());
else
error.printf("\t*** Error: Match Count via RegExp (%d) does not match analysis (%d) ***%n", matched[i], result.getMatchCount());
if (options.verbose != 0) {
error.println("Failed to match:");
for (final String failure : failures)
error.println("\t" + failure);
}
}
}
}
if (options.json)
output.printf("]%n");
resultsTime = System.currentTimeMillis();
final Runtime instance = Runtime.getRuntime();
final double usedMemory = (instance.totalMemory() - instance.freeMemory()) / (1024.0 * 1024.0);
final long durationTime = System.currentTimeMillis() - startTime;
if (!options.json) {
if (options.col == -1) {
final double percentage = numFields == 0 ? 0 : ((double)typesDetected*100)/numFields;
error.printf("Summary: File: %s, Types detected %d of %d (%.2f%%), Matched %d, Samples %d, Used Memory: %.2fMB.%n",
filename, typesDetected, numFields, percentage, matchCount, sampleCount, usedMemory);
}
else {
final double confidence = result == null ? 0 : result.getConfidence();
error.printf("Summary: Type detected: %s, Matched %d, Samples %d (Confidence: %.2f%%), Used Memory: %.2fMB.%n",
(typesDetected == 1 ? "yes" : "no"), matchCount,
sampleCount, confidence*100, usedMemory);
}
error.printf("Execution time (#fields: %d, #records: %d): initialization: %dms, consumption: %dms, results: %dms, total: %dms%n",
numFields, rawRecordIndex, initializedTime - startTime, consumedTime - initializedTime, resultsTime - consumedTime, durationTime);
}
}
}