All downloads are free. Search and download functionality uses the official Maven repository.

com.cobber.fta.driver.Driver Maven / Gradle / Ivy

Go to download

Analyze Text data to determine simple type and Semantic type information as well as other key metrics associated with a text stream.

There is a newer version: 15.10.0
Show newest version
/*
 * Copyright 2017-2024 Tim Segall
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Simple Driver to utilize the FTA framework.
 */
package com.cobber.fta.driver;

import java.io.IOException;
import java.io.PrintStream;
import java.text.DateFormatSymbols;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.text.NumberFormat;
import java.util.Calendar;
import java.util.Collection;
import java.util.GregorianCalendar;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;

import com.cobber.fta.LogicalType;
import com.cobber.fta.LogicalTypeFinite;
import com.cobber.fta.LogicalTypeInfinite;
import com.cobber.fta.LogicalTypeRegExp;
import com.cobber.fta.PluginDefinition;
import com.cobber.fta.PluginDocumentationEntry;
import com.cobber.fta.TextAnalyzer;
import com.cobber.fta.core.FTAPluginException;
import com.cobber.fta.core.FTAUnsupportedLocaleException;
import com.cobber.fta.core.Utils;
import com.cobber.fta.driver.faker.Faker;

public class Driver {
	private static DriverOptions cmdLineOptions;

	public static void main(final String[] args) throws IOException {
		final PrintStream output = System.out;
		final PrintStream error = System.err;
		boolean helpRequested = false;
		String replayFile = null;

		cmdLineOptions = new DriverOptions();
		final String[] unprocessed = cmdLineOptions.addFromStringArray(args);
		int idx = 0;
		if (unprocessed != null) {
			while (idx < unprocessed.length && unprocessed[idx].charAt(0) == '-') {
				if ("--createBloomfilter".equals(unprocessed[idx])) {
					DriverUtils.createBloomOutput(unprocessed[idx + 1], unprocessed[idx + 2]);
					System.exit(0);
				}
				else if ("--createNormalized".equals(unprocessed[idx])) {
					DriverUtils.createNormalizedOutput(unprocessed[idx + 1]);
					System.exit(0);
				}
				else if ("--createSemanticTypesMarkdown".equals(unprocessed[idx])) {
					DriverUtils.createSemanticTypesMarkdown();
					System.exit(0);
				}
				else if ("--help".equals(unprocessed[idx])) {
					error.println("Usage: fta [OPTIONS] file ...");
					error.println("Valid OPTIONS are:");
					error.println(" --abbreviationPunctuation - Disable NO_ABBREVIATION_PUNCTUATION mode");
					error.println(" --bulk - Enable bulk mode (input format = key,fieldName,fieldValue,fieldCount)");
					error.println(" --charset  - Use the supplied  to read the input files");
					error.println(" --col  - Only analyze column ");
					error.println(" --createBloomfilter   - Create Bloom Filter from CSV input, type: 'integer'|'string'");
					error.println(" --createNormalized  - Create Normalized output from CSV input");
					error.println(" --createSemanticTypesMarkdown - Create MarkDown documenting the Semantic Types supported");
					error.println(" --debug  - Set the debug level to ");
					error.println(" --delimiter  - Set the delimiter to the character ");
					error.println(" --detectWindow  - Set the size of the detect window to ");
					error.println(" --faker 
- Header is a comma separated list of Semantic Types"); error.println(" --formatDetection - Enable Format Detection"); error.println(" --help - Print this help"); error.println(" --json - Output as JSON"); error.println(" --knownTypes - Comma separated list of Semantic Types"); error.println(" --legacyJSON - Output legacy JSON - compatible with FTA 11.X or lower"); error.println(" --locale - Locale to use as opposed to default"); error.println(" --maxCardinality - Set the size of the Maximum Cardinality set supported"); error.println(" --maxInputLength - Set the Maximum Input length supported"); error.println(" --maxOutlierCardinality - Set the size of the Maximum Outlier Cardinality set supported"); error.println(" --noAnalysis - Do not do analysis"); error.println(" --noPretty - Do not pretty print analysis"); error.println(" --noQuantiles - Do not track quantiles"); error.println(" --noSemanticTypes - Do not register any Semantic Types"); error.println(" --noStatistics - Do not track statistics"); error.println(" --pluginDefinition - Output the plugin definitions from the training data set"); error.println(" --pluginMode true|false - Set the detect mode when running Plugin validate"); error.println(" --pluginName - Use supplied Plugin to generate samples or a signature (record count based on --records)"); error.println(" --pluginThreshold - Set the plugin threshold percentage (0-100) for detection"); error.println(" --quoteChar - Set the quote character to "); error.println(" --records - The number of records to analyze"); error.println(" --replay .fta - Replay the FTA trace file"); error.println(" --resolutionMode - Auto DayFirst or MonthFirst is determined from Locale"); error.println(" --samples - If set then generate samples (see --faker for comprehensive support)"); error.println(" --semanticType - Can be inline or as a File"); error.println(" --signature - Output the Signature for the supplied pluginName"); error.println(" --skip - Skip the initial rows of 
the input"); error.println(" --threshold - Set the threshold percentage (0-100) for detection"); error.println(" --trace - Set trace options"); error.println(" --trailer - Skip the final rows of the input"); error.println(" --validate - Set the validations level to , 1 == counts, 2 == regExp"); error.println(" --verbose - Output each record as it is processed"); error.println(" --xMaxCharsPerColumn - Set the maximum column width (CSV parsing option)"); error.println(" --xMaxColumns - Set the maximum number of columns (CSV parsing option - default 1024)"); helpRequested = true; } else if ("--replay".equals(unprocessed[idx])) replayFile = unprocessed[++idx]; else if ("--version".equals(unprocessed[idx])) { error.printf("%s%n", Utils.getVersion()); System.exit(0); } else { error.printf("ERROR: Unrecognized option: '%s', use --help%n", unprocessed[idx]); System.exit(1); } idx++; } } // Are we are replaying a trace file? if (replayFile != null) { Replay.replay(replayFile, cmdLineOptions); System.exit(0); } // Are we generating a signature? if (cmdLineOptions.signature) { final TextAnalyzer analyzer = TextAnalyzer.getDefaultAnalysis(cmdLineOptions.locale); final LogicalType logical = DriverUtils.getLogicalType(analyzer, cmdLineOptions.pluginName); error.println(logical.getSignature()); System.exit(0); } // Are we generating synthetic data? if (cmdLineOptions.faker != null) { final Faker faker = new Faker(cmdLineOptions, output, error); faker.fake(); System.exit(0); } // Are we generating all samples? if (cmdLineOptions.samples) { final long ouputRecords = cmdLineOptions.recordsToProcess == -1 ? 
20 : cmdLineOptions.recordsToProcess; final TextAnalyzer analyzer = TextAnalyzer.getDefaultAnalysis(cmdLineOptions.locale); final Collection registered = analyzer.getPlugins().getRegisteredSemanticTypes(); for (final LogicalType logical : registered) { try (PrintStream results = new PrintStream(logical.getSemanticType() + ".csv")) { if (logical instanceof LogicalTypeRegExp && !((LogicalTypeRegExp)logical).isRegExpComplete()) error.printf("ERROR: Semantic Type (%s) does implement LTRandom interface - however samples may not be useful.%n", logical.getSemanticType()); // Use the Semantic Type as a header! results.println(logical.getSemanticType()); for (long l = 0; l < ouputRecords; l++) results.printf("\"%s\"%n", logical.nextRandom()); } } System.exit(0); } if (helpRequested && cmdLineOptions.verbose != 0) { final TextAnalyzer analyzer = TextAnalyzer.getDefaultAnalysis(cmdLineOptions.locale); final Collection registered = analyzer.getPlugins().getRegisteredSemanticTypes(); final Set names = new TreeSet<>(); // Sort the registered plugins by the Semantic Type name for (final LogicalType logical : registered) names.add(logical.getSemanticType()); if (!registered.isEmpty()) { error.println("\nRegistered Semantic Types:"); for (final String name : names) { final LogicalType logical = analyzer.getPlugins().getRegistered(name); if (logical instanceof LogicalTypeFinite) { final LogicalTypeFinite finite = (LogicalTypeFinite)logical; error.printf("\t%s (Finite): Priority: %d, Cardinality: %d, MaxLength: %d, MinLength: %d", logical.getSemanticType(), logical.getPriority(), finite.getSize(), finite.getMaxLength(), finite.getMinLength()); } else if (logical instanceof LogicalTypeInfinite) error.printf("\t%s (Infinite): Priority: %d", logical.getSemanticType(), logical.getPriority()); else { error.printf("\t%s (RegExp): Priority: %d, RegExp: '%s'", logical.getSemanticType(), logical.getPriority(), logical.getRegExp()); } final PluginDefinition defn = 
logical.getPluginDefinition(); error.printf(", Locales: '%s'%n\t\t%s%n", defn.getLocaleDescription(), logical.getDescription()); if (defn.documentation != null) { for (final PluginDocumentationEntry entry : defn.documentation) System.err.printf("\t\t-> \"%s\": \"%s\"%n", entry.source, entry.reference); } } } final Locale locale = cmdLineOptions.locale == null ? Locale.getDefault() : cmdLineOptions.locale; final DateFormatSymbols dfs = DateFormatSymbols.getInstance(locale); final GregorianCalendar cal = (GregorianCalendar) Calendar.getInstance(locale); final int actualMonths = cal.getActualMaximum(Calendar.MONTH); error.printf("%nLocale: '%s'%n", locale.toLanguageTag()); error.printf("\tMonths: "); final String[] months = dfs.getMonths(); for (int i = 0; i <= actualMonths; i++) { error.printf("%s (%S)", months[i], months[i].toUpperCase(locale)); if (i != actualMonths) error.printf(", "); } error.printf("%n\tShort Months: "); final String[] shortMonths = dfs.getShortMonths(); for (int i = 0; i <= actualMonths; i++) { error.printf("%s", shortMonths[i]); if (i != actualMonths) error.printf(", "); } final String[] amPmStrings = dfs.getAmPmStrings(); error.printf("%n\tAM/PM: %s, %s%n", amPmStrings[0], amPmStrings[1]); final DecimalFormat formatter = (DecimalFormat) NumberFormat.getInstance(locale); final DecimalFormatSymbols symbols = formatter.getDecimalFormatSymbols(); error.printf("\tDecimal Separator: '%c'%n", symbols.getDecimalSeparator()); error.printf("\tGrouping Separator: '%c'%n", symbols.getGroupingSeparator()); error.printf("\tMinus Sign: '%c'%n", symbols.getMinusSign()); error.printf("\tExponent Separator: '%s'%n", symbols.getExponentSeparator()); final NumberFormat simple = NumberFormat.getNumberInstance(locale); if (simple instanceof DecimalFormat) error.printf("\tNegative Prefix: '%s'%n", ((DecimalFormat) simple).getNegativePrefix()); System.exit(0); } if (unprocessed == null || idx == unprocessed.length) { error.printf("ERROR: No file to process supplied, 
use --help%n"); System.exit(1); } // Loop over all the file arguments while (idx < unprocessed.length) { final String filename = unprocessed[idx++]; final FileProcessor fileProcessor = new FileProcessor(System.err, filename, cmdLineOptions); try { fileProcessor.process(); } catch (FTAPluginException e) { error.printf("ERROR: Plugin Exception: %s%n", e.getMessage()); System.exit(1); } catch (FTAUnsupportedLocaleException e) { final Locale activeLocale = cmdLineOptions.locale != null ? cmdLineOptions.locale : Locale.getDefault(); error.printf("ERROR: Unsupported Locale: %s, error: %s%n", activeLocale.toLanguageTag(), e.getMessage()); System.exit(1); } catch (FTAProcessingException e) { final String message = cmdLineOptions.verbose != 0 && e.getCause() != null ? e.getCause().getMessage() : e.getMessage(); error.printf("ERROR: Filename: %s, error: %s%n", e.getFilename(), message); } catch (Throwable t) { error.printf("ERROR: '%s' error: %s%n", filename, t.getMessage()); t.printStackTrace(error); } } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy