All downloads are free. Search and download functionality uses the official Maven repository.

eu.openminted.uc.socialsciences.ner.util.BinaryCasToTsvConverter Maven / Gradle / Ivy

The newest version!
package eu.openminted.uc.socialsciences.ner.util;

import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;
import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline;

import java.io.IOException;

import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.uima.UIMAException;
import org.kohsuke.args4j.Option;
import org.kohsuke.args4j.spi.BooleanOptionHandler;
import org.kohsuke.args4j.spi.StringOptionHandler;

import de.tudarmstadt.ukp.dkpro.core.io.bincas.BinaryCasReader;
import eu.openminted.uc.socialsciences.common.CommandLineArgumentHandler;

/**
 * Converts binary CAS files found under an input directory into a TSV file.
 *
 * <p>The conversion runs a uimaFIT pipeline consisting of a {@link BinaryCasReader} that reads
 * every file below the input path whose name ends with the configured file extension, and a
 * {@code MyStanfordTsvWriter} that writes the result to the output path.
 *
 * <p>The converter can be used from the command line via {@link #main(String[])} or
 * programmatically by configuring it through the setters and calling {@link #run()}.
 */
public class BinaryCasToTsvConverter {

	private static final Logger logger = LogManager.getLogger(BinaryCasToTsvConverter.class);

	private static final String DEFAULT_OUTPUT = "stanfordTrain.tsv";
	private static final String DEFAULT_FILE_FORMAT = ".ser";

	@Option(name = "-i", handler = StringOptionHandler.class,
			usage = "input directory containing binary CAS files to be converted", required = true)
	private String inputPath = null;

    @Option(name = "-o", usage = "[optional] path to save the converted file to. Default values is '"
            + DEFAULT_OUTPUT + "'")
	private String outputPath = DEFAULT_OUTPUT;

	@Option(name = "-subtypes", handler = BooleanOptionHandler.class,
			usage = "[optional] useSubTypes flag. If set, value and modifier of an annotation will "
			        + "be attached to create more fine-grained classes. If not set, only the "
			        + "coarse-grained annotation (e.g. LOC, PER, etc.) value will be exported.")
	private boolean useSubTypes = false;
	
    @Option(name = "-f", usage = "[optional] specify the file extension of the input files. Default value is '"
            + DEFAULT_FILE_FORMAT + "'")
	private String fileFormat = DEFAULT_FILE_FORMAT;

	/**
	 * Command-line entry point: parses {@code args} into a new converter and runs it.
	 *
	 * @param args command-line arguments ({@code -i}, {@code -o}, {@code -subtypes}, {@code -f})
	 */
	public static void main(String[] args) {
		new BinaryCasToTsvConverter().run(args);
	}

	/**
	 * Runs the conversion using the values previously configured through the setters.
	 *
	 * @throws IllegalArgumentException if the input path, output path or file format is
	 *         {@code null}
	 */
	public void run()
	{
		assertFields();
		runInternal();
	}

	/**
	 * Verifies that all fields required by the pipeline have been set.
	 *
	 * @throws IllegalArgumentException if a required field is {@code null}
	 */
	private void assertFields() {
		if (inputPath == null) {
			throw new IllegalArgumentException("inputPath can not be null!");
		}
		if (outputPath == null) {
			throw new IllegalArgumentException("outputPath can not be null!");
		}
		// A null extension would be concatenated into the reader pattern as the text "null".
		if (fileFormat == null) {
			throw new IllegalArgumentException("fileFormat can not be null!");
		}
	}

	/**
	 * Populates the annotated option fields from {@code args} and runs the conversion.
	 *
	 * @param args command-line arguments to parse
	 */
	private void run(String[] args) {
		new CommandLineArgumentHandler().parseInput(args, this);

		runInternal();
	}

	/**
	 * Builds and executes the reader/writer pipeline.
	 *
	 * <p>Failures raised by the pipeline are logged together with their cause and are not
	 * propagated to the caller.
	 */
	private void runInternal() {
		logger.info("Reading input files from [" + inputPath + "]");

		if (outputPath.equals(DEFAULT_OUTPUT)) {
			logger.info("No path for saving the tsv files was specified. Default value will be used.");
		}
		logger.info("Will write the output to [" + outputPath + "]");

		try {
			runPipeline(
					createReaderDescription(BinaryCasReader.class,
							BinaryCasReader.PARAM_SOURCE_LOCATION, inputPath,
							// match every file below the input path that ends with the extension
							BinaryCasReader.PARAM_PATTERNS, "/**/*" + fileFormat),
					createEngineDescription(MyStanfordTsvWriter.class,
							MyStanfordTsvWriter.PARAM_TARGET_LOCATION, outputPath,
							MyStanfordTsvWriter.PARAM_USE_SUBTYPES, useSubTypes,
							MyStanfordTsvWriter.PARAM_SINGULAR_TARGET, true,
							MyStanfordTsvWriter.PARAM_OVERWRITE, true));

			logger.info("Process complete.");
		} catch (UIMAException | IOException e) {
			// Route the failure through the logger (with its cause) instead of stderr.
			logger.error("Failed to convert binary CAS files from [" + inputPath + "] to ["
					+ outputPath + "]", e);
		}
	}

	/**
	 * Sets the directory containing the binary CAS files to convert.
	 *
	 * @param inputPath input directory
	 */
	public void setInputPath(String inputPath)
	{
		this.inputPath = inputPath;
	}

	/**
	 * @return the directory containing the binary CAS files to convert
	 */
	public String getInputPath()
	{
		return inputPath;
	}

	/**
	 * Sets the path the converted file is written to.
	 *
	 * @param outputPath output path
	 */
	public void setOutputPath(String outputPath)
	{
		this.outputPath = outputPath;
	}

	/**
	 * @return the path the converted file is written to
	 */
	public String getOutputPath()
	{
		return outputPath;
	}

	/**
	 * Sets whether the sub-types flag is passed to the writer.
	 *
	 * @param value {@code true} to enable the sub-types flag
	 */
	public void setUseSubTypes(boolean value)
	{
		useSubTypes = value;
	}

	/**
	 * @return whether the sub-types flag is passed to the writer
	 */
	public boolean isUseSubTypes()
	{
		return useSubTypes;
	}

	/**
	 * Sets the file extension (including the leading dot, e.g. {@code ".ser"}) that input
	 * files must end with to be picked up by the reader.
	 *
	 * @param fileFormat file extension of the input files
	 */
	public void setFileFormat(String fileFormat)
	{
		this.fileFormat = fileFormat;
	}

	/**
	 * @return the file extension that input files must end with
	 */
	public String getFileFormat()
	{
		return fileFormat;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy