
com.cobber.fta.driver.FileProcessor

Analyze text data to determine simple type and Semantic Type information, as well as other key metrics associated with a text stream.

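As a quick orientation before the driver source itself, here is a minimal sketch of the bulk-training API the driver relies on. The types and calls (TextAnalyzer, trainBulk, getResult, getStreamName, asJSON) all appear in the listing below; the field name and sample values are invented purely for illustration.

import java.util.HashMap;
import java.util.Map;

import com.cobber.fta.TextAnalysisResult;
import com.cobber.fta.TextAnalyzer;

public class BulkSketch {
	public static void main(final String[] args) throws Exception {
		// Pre-aggregated samples: value -> observed count, the same shape processBulk accumulates
		final Map<String, Long> counts = new HashMap<>();
		counts.put("2021-01-07", 120L);
		counts.put("2021-03-22", 75L);
		counts.put("", 3L);

		// Hypothetical stream name - any descriptive field name works here
		final TextAnalyzer analyzer = new TextAnalyzer("ShipDate");
		analyzer.trainBulk(counts);

		final TextAnalysisResult result = analyzer.getResult();
		// asJSON(pretty, verbose) mirrors the driver's options.pretty / options.verbose usage
		System.out.printf("Field '%s' - %s%n", analyzer.getStreamName(), result.asJSON(true, 0));
	}
}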
/*
 * Copyright 2017-2024 Tim Segall
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cobber.fta.driver;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.text.MessageFormat;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import com.cobber.fta.TextAnalysisResult;
import com.cobber.fta.TextAnalyzer;
import com.cobber.fta.core.CircularBuffer;
import com.cobber.fta.core.FTAMergeException;
import com.cobber.fta.core.FTAPluginException;
import com.cobber.fta.core.FTAUnsupportedLocaleException;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.univocity.parsers.common.TextParsingException;
import com.univocity.parsers.csv.CsvParser;
import com.univocity.parsers.csv.CsvParserSettings;
import com.univocity.parsers.csv.UnescapedQuoteHandling;

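/**
 * Process a single file: configure the CSV parser from the driver options, then run either
 * bulk analysis (pre-aggregated key/value/count input) or per-field analysis of every record.
 */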
class FileProcessor {
	private final DriverOptions options;
	private final PrintStream error;
	private final String filename;
	private PrintStream output;

	FileProcessor(final PrintStream error, final String filename, final DriverOptions cmdLineOptions) {
		this.error = error;
		this.filename = filename;
		this.options = new DriverOptions(cmdLineOptions);
	}

	protected void process() throws IOException, FTAPluginException, FTAUnsupportedLocaleException, FTAProcessingException, FTAMergeException {
		if (Files.exists(Paths.get(filename + ".options"))) {
			options.addFromFile(filename + ".options");
		}

		output = options.output ? new PrintStream(filename + ".out") : System.out;

		final CsvParserSettings settings = new CsvParserSettings();
		settings.setHeaderExtractionEnabled(true);
		settings.detectFormatAutomatically();
		settings.setLineSeparatorDetectionEnabled(true);
		settings.setIgnoreLeadingWhitespaces(false);
		settings.setIgnoreTrailingWhitespaces(false);
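		// STOP_AT_DELIMITER: treat an unescaped quote as part of the value and end the field at the next delimiter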
		settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER);
//		settings.setNullValue("");
		settings.setEmptyValue("");
		settings.setSkipEmptyLines(false);
		if (options.delimiter != null) {
			settings.getFormat().setDelimiter(options.delimiter.charAt(0));
			settings.setDelimiterDetectionEnabled(false);
		}
		else
			settings.setDelimiterDetectionEnabled(true, ',', '\t', '|', ';');
		if (options.quoteChar != null) {
			settings.getFormat().setQuote(options.quoteChar.charAt(0));
			settings.setQuoteDetectionEnabled(false);
		}
		if (options.xMaxCharsPerColumn != -1)
			settings.setMaxCharsPerColumn(options.xMaxCharsPerColumn);

		settings.setMaxColumns(options.xMaxColumns);

		try {
			if (options.bulk)
				processBulk(settings);
			else
				processAllFields(settings);
		}
		catch (Exception e) {
			if (options.output)
				output.close();
			throw e;
		}
	}

	class RowCount {
		int numFields;
		long firstRow;
		long count;

		RowCount(final int numFields, final long firstRow) {
			this.numFields = numFields;
			this.firstRow = firstRow;
			this.count = 1;
		}

		RowCount inc() {
			this.count++;
			return this;
		}
	}

	private void processBulk(final CsvParserSettings settings) throws IOException, FTAPluginException, FTAUnsupportedLocaleException, FTAProcessingException {
		String[] header;
		int numFields;
		TextAnalyzer analyzer;
		TextAnalysisResult result;
		String previousKey = null;
		String key;
		String previousName = null;
		String name = null;
		final Map<String, Long> bulkMap = new HashMap<>();

		try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(filename)), options.charset))) {
			final CsvParser parser = new CsvParser(settings);
			parser.beginParsing(in);

			header = parser.getRecordMetadata().headers();

			if (header.length != 4)
				throw new FTAProcessingException(filename,
						MessageFormat.format("Expected input with four columns (key,fieldName,fieldValue,fieldCount). {0} field(s) in input",
								 header.length));

			numFields = header.length;

			long thisRecord = 0;
			long totalCount = 0;
			String[] row;
			final Map<Integer, RowCount> errors = new HashMap<>();

			while ((row = parser.parseNext()) != null) {
				thisRecord++;
				if (row.length != numFields) {
					final RowCount existing = errors.get(row.length);
					if (existing == null)
						errors.put(row.length, new RowCount(row.length, thisRecord));
					else
						errors.put(row.length, existing.inc());
					continue;
				}
				key = row[0];
				name = row[1];
				final String fieldValue = row[2];
				final Long fieldCount = Long.valueOf(row[3].trim());
				totalCount += fieldCount;
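				// Input is grouped by key - when the key changes, flush the samples accumulated for the previous field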
				if (previousKey == null || !key.equals(previousKey)) {
					if (!bulkMap.isEmpty()) {
						analyzer = new TextAnalyzer(previousName);
						options.apply(analyzer);
						analyzer.trainBulk(bulkMap);
						analyzer.setTotalCount(totalCount);
						result = analyzer.getResult();
						output.printf("Field '%s' - %s%n", sanitize(analyzer.getStreamName()), result.asJSON(options.pretty, options.verbose));
						totalCount = 0;
					}
					bulkMap.clear();
					previousKey = key;
					previousName = name;
				}
				bulkMap.put(fieldValue, fieldCount);
			}

			if (!errors.isEmpty()) {
				for (final RowCount recordError : errors.values())
					error.printf("ERROR: File: '%s', %d records skipped with %d fields, first occurrence %d, expected %d%n",
							filename, recordError.count, recordError.numFields, recordError.firstRow, numFields);
			}

			if (!bulkMap.isEmpty()) {
				analyzer = new TextAnalyzer(name);
				options.apply(analyzer);
				analyzer.trainBulk(bulkMap);
				analyzer.setTotalCount(totalCount);
				result = analyzer.getResult();
				output.printf("Field '%s' - %s%n", sanitize(analyzer.getStreamName()), result.asJSON(options.pretty, options.verbose));
			}
		}
	}

	private String sanitize(final String input) {
		if (input == null || input.isEmpty())
			return input;

		final StringBuilder b = new StringBuilder();

		for (int i = 0; i < input.length(); i++) {
			if (input.charAt(i) == '\n')
				b.append("%0A");
			else if (input.charAt(i) == '\r')
				b.append("%0D");
			else
				b.append(input.charAt(i));
		}

		return b.toString();
	}

	private void processAllFields(final CsvParserSettings settings) throws IOException, FTAPluginException, FTAUnsupportedLocaleException, FTAProcessingException, FTAMergeException {
		final long startTime = System.currentTimeMillis();
		long initializedTime = -1;
		long consumedTime = -1;
		long resultsTime = -1;
		Processor processor = null;
		Processor altProcessor = null;
		String[] header = null;
		int numFields = 0;
		long rawRecordIndex = 0;
		final Map<Integer, RowCount> errors = new HashMap<>();

		try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(filename)), options.charset))) {

			// Skip the first 'options.skip' lines if requested
			if (options.skip != 0) {
				for (int i = 0; i < options.skip; i++)
					in.readLine();
				rawRecordIndex += options.skip;
			}

			final CsvParser parser = new CsvParser(settings);
			parser.beginParsing(in);

			header = parser.getRecordMetadata().headers();
			if (header == null)
				throw new FTAProcessingException(filename, "Cannot parse header");

			numFields = header.length;
			if (options.col > numFields)
				throw new FTAProcessingException(filename, MessageFormat.format("Column {0} does not exist.  Only {1} field(s) in input.", options.col, numFields));

			for (int i = 0; i < numFields; i++) {
				if ((options.col == -1 || options.col == i) && options.verbose != 0 && options.noAnalysis)
					System.out.println(header[i]);
			}

			processor = new Processor(com.cobber.fta.core.Utils.getBaseName(Paths.get(filename).getFileName().toString()), header, options);
			if (options.testmerge)
				altProcessor = new Processor(com.cobber.fta.core.Utils.getBaseName(Paths.get(filename).getFileName().toString()), header, options);
			initializedTime = System.currentTimeMillis();

			final CircularBuffer buffer = new CircularBuffer(options.trailer + 1);

			String[] row;
			int processedRecords = 0;

			while ((row = parser.parseNext()) != null) {
				rawRecordIndex++;
				// Skip blank lines
				if (row.length == 1 && row[0] == null)
					continue;
				if (row.length != numFields) {
					final RowCount existing = errors.get(row.length);
					if (existing == null)
						errors.put(row.length, new RowCount(row.length, rawRecordIndex));
					else
						errors.put(row.length, existing.inc());
					continue;
				}
				buffer.add(row);
				if (!buffer.isFull())
					continue;
				row = buffer.get();
				processedRecords++;

				if (options.testmerge) {
					if (processedRecords % 2 == 0)
						processor.consume(row);
					else
						altProcessor.consume(row);
				}
				else
					processor.consume(row);

				if (processedRecords == options.recordsToProcess) {
					parser.stopParsing();
					break;
				}
			}
			consumedTime = System.currentTimeMillis();
		}
		catch (FileNotFoundException e) {
			throw new FTAProcessingException(filename, "File not found", e);
		}
		catch (TextParsingException|ArrayIndexOutOfBoundsException e) {
			throw new FTAProcessingException(filename, "Univocity exception", e);
		}

		if (options.testmerge)
			processor = Processor.merge(processor, altProcessor);

		if (!errors.isEmpty()) {
			long toSkip = -1;
			for (final RowCount recordError : errors.values()) {
				error.printf("ERROR: File: '%s', %d records skipped with %d fields, first occurrence %d, expected %d%n",
						filename, recordError.count, recordError.numFields, recordError.firstRow, numFields);
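				// If the vast majority of records were rejected, the real data likely starts at the first mismatch - suggest --skip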
				if (rawRecordIndex > 20 && recordError.count > .8 * rawRecordIndex)
					toSkip = recordError.firstRow;
			}
			if (toSkip != -1)
				error.printf("ERROR: File: '%s', retry with --skip %d%n", filename, toSkip);
		}

		if (options.noAnalysis)
			System.exit(0);

		// Validate the result of the analysis if requested
		final int[] matched = new int[numFields];
		final int[] nulls = new int[numFields];
		final int[] blanks = new int[numFields];
		final Set<String> failures = new HashSet<>();

		// At validation level 2, re-read the file and check every value against the detected RegExp
		if (options.validate == 2) {
			try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(filename)), options.charset))) {

				final CsvParser parser = new CsvParser(settings);
				parser.beginParsing(in);
				numFields = parser.getRecordMetadata().headers().length;

				final TextAnalysisResult[] results = processor.getResult();
				final Pattern[] patterns = new Pattern[numFields];

				for (int i = 0; i < numFields; i++)
					if (options.col == -1 || options.col == i)
						patterns[i] = Pattern.compile(results[i].getRegExp());

				rawRecordIndex = 0;
				String[] row;

				while ((row = parser.parseNext()) != null) {
					rawRecordIndex++;
					if (row.length != numFields)
						continue;

					for (int i = 0; i < numFields; i++) {
						if (options.col == -1 || options.col == i) {
							final String value = row[i];
							if (value == null)
								nulls[i]++;
							else if (value.trim().isEmpty())
								blanks[i]++;
							else if (patterns[i].matcher(value).matches())
								matched[i]++;
							else if (options.verbose != 0)
								failures.add(value);
						}
					}
					if (rawRecordIndex == options.recordsToProcess) {
						parser.stopParsing();
						break;
					}
				}
			}
		}

		int typesDetected = 0;
		long matchCount = 0;
		long sampleCount = 0;
		TextAnalysisResult result = null;
		if (options.json)
			output.printf("[%n");
		final TextAnalysisResult[] results = processor.getResult();
		for (int i = 0; i < numFields; i++) {
			if (options.col == -1 || options.col == i) {
				final TextAnalyzer analyzer = processor.getAnalyzer(i);
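				// Unless parsing stopped early at the requested record limit, the whole file was read, so the true total is known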
				if (rawRecordIndex != options.recordsToProcess)
					analyzer.setTotalCount(rawRecordIndex);

				result = results[i];
				if (options.json) {
					if (i != 0 && options.col == -1)
						output.printf(",");
				}
				else
					output.printf("Field '%s' (%d) - ", sanitize(analyzer.getStreamName()), i);
				output.printf("%s%n", result.asJSON(options.pretty, options.verbose));

				if (options.pluginDefinition) {
					final ObjectNode pluginDefinition = result.asPlugin(analyzer);
					if (pluginDefinition != null) {
						final ObjectMapper mapper = new ObjectMapper();

						final ObjectWriter writer = mapper.writerWithDefaultPrettyPrinter();
						try {
							output.printf("%s%n", writer.writeValueAsString(pluginDefinition));
						} catch (JsonProcessingException e) {
							throw new FTAProcessingException(filename, "JsonProcessing exception", e);
						}
					}
				}
				if (result.getType() != null)
					typesDetected++;
				matchCount += result.getMatchCount();
				sampleCount += result.getSampleCount();

				// Check the counts if we are validating
				if (options.validate >= 1) {
					final String ret = result.checkCounts();
					if (ret != null)
						throw new FTAProcessingException(filename,
								MessageFormat.format("Composite: {0}, field: {1} ({2}), failed count validation - {3}",
								analyzer.getContext().getCompositeName(), analyzer.getContext().getStreamName(),
								analyzer.getContext().getStreamIndex(), ret));
				}

				if (options.validate == 2 && matched[i] != result.getMatchCount()) {
					if (result.isSemanticType())
						if (matched[i] > result.getMatchCount())
							error.printf("\t*** Warning: Match Count via RegExp (%d) > LogicalType match analysis (%d) ***%n", matched[i], result.getMatchCount());
						else
							error.printf("\t*** Error: Match Count via RegExp (%d) < LogicalType match analysis (%d) ***%n", matched[i], result.getMatchCount());
					else
						error.printf("\t*** Error: Match Count via RegExp (%d) does not match analysis (%d) ***%n", matched[i], result.getMatchCount());
					if (options.verbose != 0) {
						error.println("Failed to match:");
						for (final String failure : failures)
							error.println("\t" + failure);
					}
				}
			}
		}
		if (options.json)
			output.printf("]%n");
		resultsTime = System.currentTimeMillis();

		final Runtime instance = Runtime.getRuntime();
		final double usedMemory = (instance.totalMemory() - instance.freeMemory()) / (1024.0 * 1024);
		final long durationTime = System.currentTimeMillis() - startTime;

		if (!options.json) {
			if (options.col == -1) {
				final double percentage = numFields == 0 ? 0 : ((double)typesDetected*100)/numFields;
				error.printf("Summary: File: %s, Types detected %d of %d (%.2f%%), Matched %d, Samples %d, Used Memory: %.2fMB.%n",
						filename, typesDetected, numFields, percentage, matchCount, sampleCount, usedMemory);
			}
			else {
				final double confidence = result == null ? 0 : result.getConfidence();
				error.printf("Summary: Type detected: %s, Matched %d, Samples %d (Confidence: %.2f%%), Used Memory: %.2fMB.%n",
						(typesDetected == 1 ? "yes" : "no"), matchCount,
						sampleCount, confidence*100, usedMemory);
			}
			error.printf("Execution time (#fields: %d, #records: %d): initialization: %dms, consumption: %dms, results: %dms, total: %dms%n",
					numFields, rawRecordIndex, initializedTime - startTime, consumedTime - initializedTime, resultsTime - consumedTime, durationTime);
		}
	}
}
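For reference, the bulk path above (options.bulk / processBulk) expects input that is already aggregated and grouped by key, with exactly the four columns its header check demands. An illustrative input file (all values invented):

key,fieldName,fieldValue,fieldCount
customers.state,state,CA,1024
customers.state,state,TX,512
customers.state,state,NY,488
orders.shipped,shipped,true,1900
orders.shipped,shipped,false,104

Each distinct key is trained as one bulk map; because the code flushes on key change rather than sorting, all rows for a given key must be contiguous.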
