All Downloads are FREE. Search and download functionalities are using the official Maven repository.

eu.project.ttc.tools.cli.TermSuiteTerminoCLI Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package eu.project.ttc.tools.cli;

import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.base.Stopwatch;
import com.google.common.collect.Lists;

import eu.project.ttc.engines.cleaner.TermProperty;
import eu.project.ttc.engines.desc.Lang;
import eu.project.ttc.engines.desc.TermSuiteCollection;
import eu.project.ttc.models.OccurrenceType;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.resources.MemoryTermIndexManager;
import eu.project.ttc.tools.TermSuitePipeline;
import eu.project.ttc.utils.TermUtils;

/**
 * Command line interface for the Terminology extraction (Spotter+Indexer) engines.
 * 
 * @author Damien Cram
 */
public class TermSuiteTerminoCLI {
	private static final Logger LOGGER = LoggerFactory.getLogger(TermSuiteTerminoCLI.class);

	/** Short usage description of the CLI */
	private static final String USAGE = "java [-DconfigFile=] -Xms1g -Xmx2g -cp termsuite-core-x.x.jar eu.project.ttc.tools.cli.TermSuiteTerminoCLI";

	/// Parameter names (long option names, used without the leading "--")

	/** Name of the inline text parameter (the text to analyze) */
	private static final String TEXT = "text";

	/** Name of the watch parameter */
	private static final String WATCH = "watch";

	/** Name of the corpus parameter */
	private static final String PATH_TO_CORPUS = "corpus-home";

	/** Name of the resource path parameter */
	private static final String PATH_TO_RESOURCE_PACK = "resource-pack";

	/** Name of the corpus format parameter */
	private static final String CORPUS_FORMAT = "corpus-format";

	/** Name of the parameter that must be set to the tagger directory (TreeTagger home or Mate model directory) */
	public static final String P_TAGGER_HOME_DIRECTORY = "tagger-home";

	/** Name of the parameter that sets the similarity threshold of graphical variant gathering */
	private static final String GRAPHICAL_SIMILARITY = "graphical-similarity-th";

	/** Name of the parameter that shows tree tagger tags */
	private static final String SHOW_TAGGER_TAGS = "tags";

	/** Compost configuration parameters **/
	private static final String COMPOST_COEFF = "compost-coeff";
	private static final String COMPOST_MIN_COMPONENT_SIZE = "compost-min-component-size";
	private static final String COMPOST_MAX_COMPONENT_NUM = "compost-max-component-num";
	private static final String COMPOST_SIMILARITY_THRESHOLD = "compost-similarity-threshold";
	private static final String COMPOST_SCORE_THRESHOLD = "compost-score-threshold";

	/** Deactivates the occurrences saving in memory while indexing **/
	private static final String NO_OCCURRENCE = "no-occurrence";

	/** MongoDB parameters **/
	private static final String MONGODB_STORE = "mongodb-store";
	private static final String MONGODB_SOFT_LINK = "json-mongodb-soft-link";

	/*
	 * The mongo db options
	 */
	/** URL of the MongoDB occurrence store, absent unless the mongodb-store option is given */
	private static Optional<String> mongoStoreDBURL = Optional.absent();
	/** Whether the mongo store must be linked before the JSON export */
	private static boolean mongoStoreSoftLinked = false;

	/** Mate tagger parameter **/
	private static final String MATE = "mate";

	/*
	 * The taggers that can be plugged into the pipeline
	 * (a nested enum is implicitly static)
	 */
	private enum Tagger {Mate, TreeTagger}

	/*
	 * Logging arguments
	 */
	private static final String DEBUG = "debug";
	private static final String TRACE = "trace";
	private static final String NO_LOGGING = "no-logging";

	/*
	 * Contextualizer
	 */
	private static final String CONTEXTUALIZE = "contextualize";
	private static final String CONTEXTUALIZE_ALL_TERMS = "contextualize-all-terms";
	private static final String CONTEXT_SCOPE = "context-scope";
	private static final String ALLOW_MWT_IN_CONTEXTS = "allow-mwts-in-contexts";

	/*
	 * Cleaning arguments
	 */
	private static final String CLEAN_THRESHOLD = "filter-th";
	private static final String CLEAN_TOP_N = "filter-top-n";
	private static final String CLEAN_PROPERTY = "filter-property";
	private static final String CLEAN_FILTER_VARIANTS = "filter-variants";

	/*
	 * Max size filtering
	 */
	private static final String PERIODIC_FILTER_PROPERTY = "periodic-filter-property";
	private static final String PERIODIC_FILTER_MAX_SIZE = "periodic-filter-max-size";

	// the tsv file path argument and its companions
	private static final String TSV = "tsv";
	private static final String TSV_PROPERTIES = "tsv-properties";
	private static final String TSV_VARIANT_SCORES = "tsv-show-scores";

	// the json file path argument
	private static final String JSON = "json";

	// the tbx file path argument
	private static final String TBX = "tbx";

	// the jsonCAS file path argument
	private static final String JSCASFILE = "jsonCasFile";

	// tagger argument (TreeTagger unless the mate option is given)
	private static Tagger tagger = Tagger.TreeTagger;

	/*
	 * Input parameters
	 */
	private static String resourcePack = null;
	private static String corpusPath = null;
	private static Lang language = null;
	private static String encoding = "UTF-8";
	private static String taggerHome = "";
	private static String inlineText = null;
	private static TermSuiteCollection corpusType = TermSuiteCollection.TXT;
	private static float graphicalSimilarityThreshold = 0.9f;

	/*
	 * Contextualizer
	 */
	private static boolean contextualize = false;
	private static boolean contextualizeAllTerms = false;
	private static boolean allowMWTInContexts = false;
	private static int contextScope = 3;

	/*
	 * Cleaning parameters
	 */
	private static Optional<Float> cleaningThreshold = Optional.of(2f);
	private static Optional<Integer> cleaningTopN = Optional.absent();
	private static Optional<TermProperty> cleaningProperty = Optional.of(TermProperty.WR_LOG);
	private static boolean keepVariantsWhileCleaning = true;

	/*
	 * Max size periodic filtering
	 */
	private static Optional<TermProperty> periodicFilteringProperty = Optional.absent();
	private static int maxSizeFilteringMaxSize = 20000;

	/*
	 * Spotter params
	 */
	private static boolean spotWithOccurrences = true;

	/*
	 * Export params
	 */
	private static Optional<String> tsvFile = Optional.absent();
	private static Optional<TermProperty[]> tsvProperties = Optional.absent();
	private static boolean tsvShowVariantScores = false;

	private static Optional<String> jsonFile = Optional.absent();
	private static Optional<String> tbxFile = Optional.absent();

	private static Optional<String> jsonCasFile = Optional.absent();

	/*
	 * Compost params
	 */
	private static Optional<Float> compostAlpha = Optional.absent();
	private static Optional<Float> compostBeta = Optional.absent();
	private static Optional<Float> compostGamma = Optional.absent();
	private static Optional<Float> compostDelta = Optional.absent();
	private static Optional<Integer> compostMinComponentSize = Optional.absent();
	private static Optional<Integer> compostMaxComponentNum = Optional.absent();
	private static Optional<Float> compostSimilarityThreshold = Optional.of(1f);
	private static Optional<Float> compostScoreThreshold = Optional.absent();

	/*
	 * Output and display params
	 */
	/** Pattern compiled from the watch option and handed to TermUtils.showIndex */
	private static Optional<Pattern> watch = Optional.absent();

	/**
	 * Application entry point.
	 * 
	 * Starts file logging under the "logs" directory, parses the command
	 * line, configures a {@link TermSuitePipeline} from the parsed options
	 * and runs it, either on the inline text or on the corpus.
	 * A command line parsing error prints the usage; any other exception is
	 * printed on stderr and logged with its stack trace.
	 * 
	 * @param args
	 *            Command line arguments
	 * @throws UnsupportedEncodingException
	 *            kept in the signature for compatibility with existing callers
	 */
	public static void main(String[] args) throws UnsupportedEncodingException {
		File logDir = new File("logs");
		if(!logDir.exists()) 
			logDir.mkdir();
		String logPath = Paths.get("logs", "termsuite-" + new SimpleDateFormat("yyyyMMdd-HHmmss").format(new Date()) +".log").toAbsolutePath().toString();
		TermSuiteCLIUtils.logToFile(logPath);
		Stopwatch sw = Stopwatch.createStarted();
		LOGGER.info("Logging to {}", logPath);
		try {
			
			// usage
			// java -DconfigFile=myPropertiesFileName -Xms1g  -Xmx2g -cp ttc-term-suite-1.3.jar eu.project.ttc.tools.cli.TermSuiteSpotterCLI
			// if the option -DconfigFile is missing preferencesFileName is set to TermSuiteCLIUtils.USER_HOME+PREFERENCES_FILE_NAME
			// create the command line parser
			PosixParser parser = new PosixParser();

			// create the Options
			Options options = declareOptions();
			
			try {
				// Parse and set CL options
				CommandLine line = parser.parse(options, args, false);
				readArguments(line);
				
				// log level: no-logging wins over debug, which wins over trace
				if(line.hasOption(NO_LOGGING))
					TermSuiteCLIUtils.disableLogging();
				else if(line.hasOption(DEBUG))
					TermSuiteCLIUtils.setGlobalLogLevel("debug");
				else if(line.hasOption(TRACE))
					TermSuiteCLIUtils.setGlobalLogLevel("trace");
				else
					TermSuiteCLIUtils.setGlobalLogLevel("info");

				TermSuiteCLIUtils.logCommandLineOptions(line);
				
				TermSuitePipeline pipeline = TermSuitePipeline.create(language.getCode());
				
				if(isInlineMode()) {
					pipeline.setInlineString(inlineText);
				} else {
					pipeline.setCollection(corpusType, corpusPath, encoding);
				}

				// resource
				pipeline.setResourcePath(resourcePack);
				
				// mongodb
				if(mongoStoreDBURL.isPresent())
					pipeline.setMongoDBOccurrenceStore(mongoStoreDBURL.get());
				
				// tokenizer
				pipeline.aeWordTokenizer();
				
				// tagger
				if(tagger == Tagger.TreeTagger) 
					pipeline.setTreeTaggerHome(taggerHome)
						.aeTreeTagger();
				else if(tagger == Tagger.Mate) 
					pipeline.setMateModelPath(taggerHome)
						.aeMateTaggerLemmatizer();

				// Filter urlsFilter
				pipeline.aeUrlFilter();

				// stemmer
				pipeline.aeStemmer();
				
				// regex spotter
				pipeline.setSpotWithOccurrences(spotWithOccurrences);
				pipeline.aeRegexSpotter();

				// export Json CAS spotter
				if(jsonCasFile.isPresent())
					pipeline.haeJsonCasExporter(jsonCasFile.get());
				
				// filter stop words
				pipeline.aeStopWordsFilter();

				// specificity computer
				pipeline.aeSpecificityComputer();

				// compost (morphology)
				// the four coefficients are always set together by readArguments
				if(compostAlpha.isPresent()) 
					pipeline.setCompostCoeffs(compostAlpha.get(), compostBeta.get(), compostGamma.get(), compostDelta.get());
				if(compostMinComponentSize.isPresent()) 
					pipeline.setCompostMinComponentSize(compostMinComponentSize.get());
				if(compostMaxComponentNum.isPresent()) 
					pipeline.setCompostMaxComponentNum(compostMaxComponentNum.get());
				if(compostScoreThreshold.isPresent()) 
					pipeline.setCompostScoreThreshold(compostScoreThreshold.get());
				if(compostSimilarityThreshold.isPresent()) 
					pipeline.setCompostSegmentSimilarityThreshold(compostSimilarityThreshold.get());
				pipeline.aeCompostSplitter();
				
				// syntactic variant gathering
				pipeline.aeSyntacticVariantGatherer();

				// graphical variant gathering
				pipeline.setGraphicalVariantSimilarityThreshold(graphicalSimilarityThreshold);
				pipeline.aeGraphicalVariantGatherer();
				
				// filtering: the threshold cleaner has precedence over the top-n cleaner
				if(cleaningThreshold.isPresent()) {
					pipeline.setKeepVariantsWhileCleaning(keepVariantsWhileCleaning);
					pipeline.aeThresholdCleaner(
							cleaningProperty.get(), cleaningThreshold.get());
				} else if(cleaningTopN.isPresent()) {
					pipeline.setKeepVariantsWhileCleaning(keepVariantsWhileCleaning);
					pipeline.aeTopNCleaner(cleaningProperty.get(), cleaningTopN.get());
				}

				if(periodicFilteringProperty.isPresent())
					pipeline.aeMaxSizeThresholdCleaner(periodicFilteringProperty.get(), maxSizeFilteringMaxSize);
				
				// contextualize
				if(contextualize) {
					pipeline
						.setContextualizeCoTermsType(allowMWTInContexts ? OccurrenceType.ALL : OccurrenceType.SINGLE_WORD)
						.aeContextualizer(contextScope, contextualizeAllTerms);
				}
				
				// stats
				pipeline.haeCasStatCounter("at end of pipeline");

				// Export
				if(tsvFile.isPresent())  {
					if(tsvProperties.isPresent())
						pipeline.setTsvExportProperties(tsvProperties.get());
					else 
						pipeline.setTsvExportProperties(
							TermProperty.PILOT,
							TermProperty.FREQUENCY
						);
					// the score display does not depend on the selected columns,
					// so it is applied whether or not tsv properties were given
					pipeline.setTsvShowScores(tsvShowVariantScores);
					pipeline.haeTsvExporter(tsvFile.get());
				}
				if(tbxFile.isPresent()) 
					pipeline.haeTbxExporter(tbxFile.get());
				if(jsonFile.isPresent())  {					
					pipeline.setExportJsonWithContext(contextualize);
					pipeline.setExportJsonWithOccurrences(true);
					if(mongoStoreSoftLinked)
						pipeline.linkMongoStore();
					pipeline.haeJsonExporter(jsonFile.get());
				}

				// run the pipeline
				// NOTE(review): termIndexName is never handed to the pipeline in this
				// method - confirm that the index is really registered under this name,
				// otherwise the getIndex() lookups below cannot find it.
				final String termIndexName = "ScriptTermIndex_" + System.currentTimeMillis();
				if(isInlineMode()) {
					LOGGER.info("Running TermSuite pipeline (inline mode)");
					JCas cas = JCasFactory.createJCas();
					cas.setDocumentText(inlineText);
					cas.setDocumentLanguage(language.getCode());
					pipeline.run(cas);
					System.err.flush();
					System.out.println("Term index: ");
					TermIndex index = MemoryTermIndexManager.getInstance().getIndex(termIndexName);
					TermUtils.showIndex(index, System.out, watch);
				} else {
					LOGGER.info("Running TermSuite pipeline in corpus mode");
					pipeline.run();
					if(watch.isPresent()) 
						TermUtils.showIndex(
								MemoryTermIndexManager.getInstance().getIndex(termIndexName), 
								new PrintStream(System.err, true, "UTF-8"), 
								watch);
				}
				LOGGER.info("Script executed in " + sw.toString());
				
			} catch (ParseException e) {
				TermSuiteCLIUtils.printUsage(e, USAGE, options); 
			}

		} catch (Exception e) {
			e.printStackTrace(System.err);
			// pass the exception itself so that the log file gets the stack trace too
			LOGGER.error(e.getMessage(), e);
		}
	}

	/**
	 * Tells whether an inline text is available for analysis.
	 * 
	 * @return <code>true</code> when the inline text is set and holds
	 *         something else than whitespace
	 */
	private static boolean isInlineMode() {
		if(inlineText == null)
			return false;
		final String trimmed = inlineText.trim();
		return trimmed.length() > 0;
	}

	/**
	 * Declares every command line option accepted by this CLI.
	 * 
	 * For each option the boolean that follows the long name tells whether
	 * the option takes a value. The trailing boolean given to
	 * TermSuiteCLIUtils.createOption is presumably the "mandatory" flag
	 * (only the language and the tagger home set it) - confirm against
	 * TermSuiteCLIUtils.
	 * 
	 * @return the options to parse the command line with
	 */
	private static Options declareOptions() {
		Options options = new Options();

		// occurrence store and periodic filtering
		options.addOption(TermSuiteCLIUtils.createOption(null, NO_OCCURRENCE, false,
				"Deactivate the occurrence store in memory (recommended for big corpus).", false));
		options.addOption(TermSuiteCLIUtils.createOption(null, PERIODIC_FILTER_PROPERTY, true,
				"Activate a periodic cleaning of the on-going terminology by a given property.", false));
		options.addOption(TermSuiteCLIUtils.createOption(null, PERIODIC_FILTER_MAX_SIZE, true,
				"The maximum allowed size of the on-going terminology in memory.", false));

		// tagger choice and inline input
		options.addOption(TermSuiteCLIUtils.createOption(null, MATE, false,
				"Use Mate tagger instead of TreeTagger.", false));
		options.addOption(TermSuiteCLIUtils.createOption(null, TEXT, true,
				"The text to analyze", false));

		// compost
		options.addOption(TermSuiteCLIUtils.createOption(null, COMPOST_MAX_COMPONENT_NUM, true,
				"The maximum number of components that a compound can have", false));
		options.addOption(TermSuiteCLIUtils.createOption(null, COMPOST_MIN_COMPONENT_SIZE, true,
				"The minimum size allowed in a component", false));
		options.addOption(TermSuiteCLIUtils.createOption(null, COMPOST_SCORE_THRESHOLD, true,
				"The segmentation score threshold of COMPOST algo.", false));
		options.addOption(TermSuiteCLIUtils.createOption(null, COMPOST_SIMILARITY_THRESHOLD, true,
				"The segment similarity threshold above which an existing string in COMPOST index is considered as recognized.", false));
		options.addOption(TermSuiteCLIUtils.createOption(null, COMPOST_COEFF, true,
				"COMPOST alpha, beta, gamma and delta parameters, separated with a hyphen \"-\". Sum must be 1", false));

		// logging
		options.addOption(TermSuiteCLIUtils.createOption(null, NO_LOGGING, false,
				"Disable logging", false));
		options.addOption(TermSuiteCLIUtils.createOption(null, DEBUG, false,
				"fine-grained logging", false));
		options.addOption(TermSuiteCLIUtils.createOption(null, TRACE, false,
				"very fine grained logging", false));

		// contextualizer
		options.addOption(TermSuiteCLIUtils.createOption(null, CONTEXTUALIZE, false,
				"Enable the contextualizer. Compute a context vector for each SWT term.", false));
		options.addOption(TermSuiteCLIUtils.createOption(null, CONTEXTUALIZE_ALL_TERMS, false,
				"Compute a context vector for MWTs too.", false));
		options.addOption(TermSuiteCLIUtils.createOption(null, ALLOW_MWT_IN_CONTEXTS, false,
				"Allow to set MWTs as cooccurrences in context vectors.", false));
		options.addOption(TermSuiteCLIUtils.createOption(null, CONTEXT_SCOPE, true,
				"The window size for term contexts capture", false));

		// corpus, resources, language and tagger
		options.addOption(TermSuiteCLIUtils.createOption(null, CORPUS_FORMAT, true,
				"The file format in the input corpus. txt and tei supported", false));
		options.addOption(TermSuiteCLIUtils.createOption("c", PATH_TO_CORPUS, true,
				"Path to the corpus", false));
		options.addOption(TermSuiteCLIUtils.createOption("r", PATH_TO_RESOURCE_PACK, true,
				"Path to the TermSuite resource pack", false));
		options.addOption(TermSuiteCLIUtils.createOption("l", TermSuiteCLIUtils.P_LANGUAGE, true,
				"language of the input files: fr/en/etc.", true));
		options.addOption(TermSuiteCLIUtils.createOption(null, TermSuiteCLIUtils.P_ENCODING, true,
				"encoding of the input files", false));
		options.addOption(TermSuiteCLIUtils.createOption("t", P_TAGGER_HOME_DIRECTORY, true,
				"TreeTagger home directory or Mate model directory", true));

		// This option carries the threshold value that readArguments parses as a
		// float, so it must be declared as taking an argument.
		options.addOption(TermSuiteCLIUtils.createOption(null, GRAPHICAL_SIMILARITY, true,
				"The similarity threshold (a value between 0 and 1, 0.9 advised) for graphical variant gathering.", false));
		options.addOption(TermSuiteCLIUtils.createOption(null, SHOW_TAGGER_TAGS, false,
				"Show tree tagger tags", false));

		// display
		options.addOption(null, WATCH, true,
				"Show infos about terms matching this string");

		// cleaning
		options.addOption(null, CLEAN_PROPERTY, true,
				"The name of the term property used for cleaning filtering  the term index");
		options.addOption(null, CLEAN_FILTER_VARIANTS, false,
				"Also filter variants with terms.");
		options.addOption(null, CLEAN_THRESHOLD, true,
				"The filtering threshold");
		options.addOption(null, CLEAN_TOP_N, true,
				"The number of terms to keep after filtering");

		// exports
		options.addOption(null, TSV, true,
				"The tsv file path where to export the term index");
		options.addOption(null, TSV_PROPERTIES, true,
				"comma-separated list of term properties to export as a column in TSV file");
		options.addOption(null, TSV_VARIANT_SCORES, false,
				"shows variant scores next to the \"V\" label");
		options.addOption(null, TBX, true,
				"The tbx file path where to export the term index");
		options.addOption(null, JSON, true,
				"The json file path where to export the term index");
		options.addOption(null, JSCASFILE, true,
				"The directory path where to export the TreeTagger token of each files give in entry of TermSuite in " +
						"Json Format");

		// mongodb
		options.addOption(null, MONGODB_STORE, true,
				"The mongo db url of the database where to store the occurrences");
		options.addOption(null, MONGODB_SOFT_LINK, false,
				"Link the mongo db occurrence store to the json export (requires --" + MONGODB_STORE + ")");

		return options;
	}

	/**
	 * Copies the values of the parsed command line into the static
	 * configuration fields of this class.
	 * 
	 * When no inline text is given on the command line, the text is read
	 * through TermSuiteCLIUtils.readIn with the requested encoding. When
	 * neither an inline text nor a corpus path is available,
	 * TermSuiteCLIUtils.exitWithErrorMessage is invoked.
	 * 
	 * @param line
	 *            the parsed command line
	 * @throws IOException
	 *            presumably raised while reading the standard input - confirm
	 *            against TermSuiteCLIUtils.readIn
	 * @throws IllegalArgumentException
	 *            if the compost coefficients are not four values summing to 1,
	 *            or if the mongodb soft link is requested without a mongodb store
	 * @throws NumberFormatException
	 *            if a numeric option value cannot be parsed
	 */
	public static void readArguments(CommandLine line) throws IOException {
		if(line.hasOption(NO_OCCURRENCE))
			spotWithOccurrences = false;

		// The encoding has to be known before the standard input is read with it
		encoding = line.getOptionValue(TermSuiteCLIUtils.P_ENCODING, "UTF-8");

		inlineText = line.getOptionValue(TEXT);
		if(inlineText == null)
			inlineText = TermSuiteCLIUtils.readIn(encoding);

		corpusPath = line.getOptionValue(PATH_TO_CORPUS);
		if(inlineText == null && corpusPath == null)
			TermSuiteCLIUtils.exitWithErrorMessage("Either the argument --" + TEXT + " or --" + PATH_TO_CORPUS + " must be set.");

		resourcePack = line.getOptionValue(PATH_TO_RESOURCE_PACK);
		language = Lang.forName(line.getOptionValue(TermSuiteCLIUtils.P_LANGUAGE));
		taggerHome = line.getOptionValue(P_TAGGER_HOME_DIRECTORY);

		if(line.hasOption(CORPUS_FORMAT)) {
			// case-insensitive: the help advertises "txt" and "tei" in lower case
			final String format = line.getOptionValue(CORPUS_FORMAT);
			if(TermSuiteCollection.TEI.name().equalsIgnoreCase(format)) {
				corpusType = TermSuiteCollection.TEI;
			} else if(TermSuiteCollection.TXT.name().equalsIgnoreCase(format)) {
				corpusType = TermSuiteCollection.TXT;
			} else
				TermSuiteCLIUtils.exitWithErrorMessage("Unknown corpus format: " + format + ". Supported formats: " + Joiner.on(',').join(TermSuiteCollection.values()));
		}

		if(line.hasOption(GRAPHICAL_SIMILARITY))
			graphicalSimilarityThreshold = Float.parseFloat(line.getOptionValue(GRAPHICAL_SIMILARITY));

		/*
		 * Compost
		 */
		if(line.hasOption(COMPOST_MIN_COMPONENT_SIZE))
			compostMinComponentSize = Optional.of(Integer.parseInt(line.getOptionValue(COMPOST_MIN_COMPONENT_SIZE)));

		if(line.hasOption(COMPOST_MAX_COMPONENT_NUM))
			compostMaxComponentNum = Optional.of(Integer.parseInt(line.getOptionValue(COMPOST_MAX_COMPONENT_NUM)));

		if(line.hasOption(COMPOST_SCORE_THRESHOLD))
			compostScoreThreshold = Optional.of(Float.parseFloat(line.getOptionValue(COMPOST_SCORE_THRESHOLD)));

		if(line.hasOption(WATCH))
			watch = Optional.of(Pattern.compile(line.getOptionValue(WATCH)));

		if(line.hasOption(COMPOST_SIMILARITY_THRESHOLD))
			compostSimilarityThreshold = Optional.of(Float.parseFloat(line.getOptionValue(COMPOST_SIMILARITY_THRESHOLD)));

		if(line.hasOption(COMPOST_COEFF)) {
			final String rawCoeffs = line.getOptionValue(COMPOST_COEFF);
			List<String> coeffs = Splitter.on('-').splitToList(rawCoeffs);
			Preconditions.checkArgument(
					coeffs.size() == 4,
					"The option --%s expects 4 values separated with a hyphen, got: %s", COMPOST_COEFF, rawCoeffs);
			final float alpha = Float.parseFloat(coeffs.get(0));
			final float beta = Float.parseFloat(coeffs.get(1));
			final float gamma = Float.parseFloat(coeffs.get(2));
			final float delta = Float.parseFloat(coeffs.get(3));
			final float sum = alpha + beta + gamma + delta;
			// Decimal fractions are not exact in binary floating point, so the
			// sum is compared to 1 with a small tolerance instead of ==
			final float tolerance = 1e-4f;
			Preconditions.checkArgument(
					Math.abs(sum - 1.0f) <= tolerance,
					String.format("The sum of Compost coeffs must be 1 (%3.2f+%3.2f+%3.2f+%3.2f=%3.2f)",
							alpha,
							beta,
							gamma,
							delta,
							sum
							)
					);
			compostAlpha = Optional.of(alpha);
			compostBeta = Optional.of(beta);
			compostGamma = Optional.of(gamma);
			compostDelta = Optional.of(delta);
		}

		/*
		 * Contextualizer
		 */
		contextualize = line.hasOption(CONTEXTUALIZE);
		allowMWTInContexts = line.hasOption(ALLOW_MWT_IN_CONTEXTS);
		contextualizeAllTerms = line.hasOption(CONTEXTUALIZE_ALL_TERMS);
		if(line.hasOption(CONTEXT_SCOPE)) {
			contextScope = Integer.parseInt(line.getOptionValue(CONTEXT_SCOPE));
		}

		/*
		 * Cleaning
		 */
		if(line.hasOption(CLEAN_THRESHOLD))
			cleaningThreshold = Optional.of(Float.parseFloat(line.getOptionValue(CLEAN_THRESHOLD)));

		if(line.hasOption(CLEAN_TOP_N)) {
			cleaningTopN = Optional.of(Integer.parseInt(line.getOptionValue(CLEAN_TOP_N)));
			// The threshold has a default value and is tested first when the
			// pipeline is built: drop that default when only top-n is requested,
			// otherwise top-n would never be applied. An explicit threshold
			// still has precedence.
			if(!line.hasOption(CLEAN_THRESHOLD))
				cleaningThreshold = Optional.absent();
		}

		if(line.hasOption(CLEAN_PROPERTY)) {
			cleaningProperty = Optional.of(TermProperty.forName(line.getOptionValue(CLEAN_PROPERTY)));
		}

		if(line.hasOption(CLEAN_FILTER_VARIANTS))
			keepVariantsWhileCleaning = false;

		// the max size is only read when the periodic filtering is activated
		if(line.hasOption(PERIODIC_FILTER_PROPERTY)) {
			periodicFilteringProperty = Optional.of(TermProperty.forName(line.getOptionValue(PERIODIC_FILTER_PROPERTY)));
			if(line.hasOption(PERIODIC_FILTER_MAX_SIZE))
				maxSizeFilteringMaxSize = Integer.parseInt(line.getOptionValue(PERIODIC_FILTER_MAX_SIZE).trim()); 
		}

		/*
		 * Exports
		 */
		if(line.hasOption(TSV))
			tsvFile = Optional.of(line.getOptionValue(TSV));
		if(line.hasOption(TSV_PROPERTIES)) {
			List<TermProperty> properties = Lists.newArrayList();
			// trimResults: tolerate blanks around the commas ("a, b")
			for(String pName:Splitter.on(",").trimResults().split(line.getOptionValue(TSV_PROPERTIES))) {
				properties.add(TermProperty.forName(pName));
			}
			tsvProperties = Optional.of(properties.toArray(new TermProperty[properties.size()]));
		}
		if(line.hasOption(TSV_VARIANT_SCORES))
			tsvShowVariantScores = true;

		if(line.hasOption(TBX))
			tbxFile = Optional.of(line.getOptionValue(TBX));
		if(line.hasOption(JSON))
			jsonFile = Optional.of(line.getOptionValue(JSON));

		if(line.hasOption(JSCASFILE))
			jsonCasFile = Optional.of(line.getOptionValue(JSCASFILE));

		if(line.hasOption(MATE))
			tagger =  Tagger.Mate;

		/*
		 * MongoDB
		 */
		if(line.hasOption(MONGODB_STORE))
			mongoStoreDBURL = Optional.of(line.getOptionValue(MONGODB_STORE));

		if(line.hasOption(MONGODB_SOFT_LINK)) {
			Preconditions.checkArgument(line.hasOption(MONGODB_STORE), "The option %s requires the option %s", MONGODB_SOFT_LINK, MONGODB_STORE);
			mongoStoreSoftLinked = true;
		}

	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy