// eu.openminted.uc.socialsciences.ner.util.MyStanfordTsvWriter Maven / Gradle / Ivy
//
// Go to download
package eu.openminted.uc.socialsciences.ner.util;

import static org.apache.commons.io.IOUtils.closeQuietly;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;

import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import webanno.custom.NamedEntity;

/**
 * Writer for a custom TSV format to be used as input for Stanford CoreNLP NER
 * training.
 * <p>
 * The writer reads annotations of the type {@code webanno.custom.NamedEntity}
 * and looks up two of its features by base name: {@code "value"} and
 * {@code "modifier"}. Both features, together with the
 * {@link #PARAM_USE_SUBTYPES} setting, are handed to {@code MyIobEncoder},
 * which produces the label written for each token. According to the original
 * author, the configuration parameters decide whether the modifiers are
 * included in the output.
 * <p>
 * Output layout: one token per line, the token text and its label separated
 * by a tab; every written sentence is followed by an empty line. Sentences
 * that cover no {@code NamedEntity} annotation are skipped.
 * <p>
 * NOTE(review): an earlier version of this comment named the first feature
 * "entityType", while the code looks up "value" - confirm against the type
 * system which one is correct.
 *
 * @author neumanmy
 */
//todo this will come in next uimafit release
//@MimeTypeCapability({ MimeTypes.TEXT_X_CONLL_2003 })
@TypeCapability(
		inputs = {
				"de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
				"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
				"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
				"webanno.custom.NamedEntity" })
public class MyStanfordTsvWriter extends JCasFileWriter_ImplBase {

	private static final Logger logger = LogManager.getLogger(MyStanfordTsvWriter.class);

	/**
	 * Name of configuration parameter that defines the file name extension of
	 * the output file.
	 */
	public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION;
	@ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".tsv")
	private String filenameSuffix;

	/**
	 * Name of configuration parameter that specifies if subtypes of annotations
	 * should be used (i.e. more fine-grained).
	 */
	public static final String PARAM_USE_SUBTYPES = "useSubTypes";
	@ConfigurationParameter(name = PARAM_USE_SUBTYPES, mandatory = true, defaultValue = "false")
	private boolean useSubTypes;

	/**
	 * Name of configuration parameter that contains the character encoding used
	 * to write the output file.
	 * <p>
	 * NOTE(review): the parameter name is taken from
	 * {@code ComponentParameters.PARAM_SOURCE_ENCODING} although the value is
	 * applied to the output writer; it is left unchanged so that existing
	 * configurations keep working.
	 */
	public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING;
	@ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8")
	private String encoding;

	/**
	 * Delegates initialization to the base class; no additional setup is done
	 * here.
	 *
	 * @param context
	 *            the UIMA context passed on to the super class
	 * @throws ResourceInitializationException
	 *             if the super class fails to initialize
	 */
	@Override
	public void initialize(UimaContext context) throws ResourceInitializationException {
		super.initialize(context);
	}

	/**
	 * Writes the given CAS to the output stream obtained for it, using the
	 * configured file name extension and character encoding.
	 *
	 * @param aJCas
	 *            the document to write
	 * @throws AnalysisEngineProcessException
	 *             wrapping any exception raised while opening the output or
	 *             converting the document, including a write failure
	 *             reported by the underlying writer
	 */
	@Override
	public void process(JCas aJCas) throws AnalysisEngineProcessException {
		logger.info("Starting processing JCas.");
		PrintWriter out = null;
		try {
			out = new PrintWriter(new OutputStreamWriter(getOutputStream(aJCas, filenameSuffix),
					encoding));
			convert(aJCas, out);
			// PrintWriter never throws on I/O problems, it only records them;
			// without this check a failed write would yield a silently
			// truncated training file.
			if (out.checkError()) {
				throw new IOException("An I/O error occurred while writing the TSV output.");
			}
			logger.info("Processing JCas finished.");
		} catch (Exception e) {
			throw new AnalysisEngineProcessException(e);
		} finally {
			closeQuietly(out);
		}
	}

	/**
	 * Converts the document sentence by sentence into tab separated lines and
	 * prints them to the given writer.
	 *
	 * @param aJCas
	 *            the document providing sentences, tokens and named entities
	 * @param aOut
	 *            the writer receiving the TSV lines
	 */
	private void convert(JCas aJCas, PrintWriter aOut) {
		Type neType = JCasUtil.getType(aJCas, NamedEntity.class);

		// NOTE(review): getFeatureByBaseName yields null for an unknown
		// feature name; how MyIobEncoder treats a null feature is not visible
		// from here.
		Feature neValue = neType.getFeatureByBaseName("value");
		Feature neModifier = neType.getFeatureByBaseName("modifier");

		// Named entities covered by each sentence, used below to skip
		// sentences that carry no annotation at all.
		Map<Sentence, Collection<NamedEntity>> idx = JCasUtil.indexCovered(aJCas, Sentence.class,
				NamedEntity.class);
		/*
		 * a custom IobEncoder that handles the mapping of the CAS annotations to IOB format
		 */
		MyIobEncoder encoder = new MyIobEncoder(aJCas.getCas(), neType, neValue, neModifier, useSubTypes);

		for (Sentence sentence : select(aJCas, Sentence.class)) {

			/*
			 * don't include sentence in temp file that contains no annotations
			 *
			 * (saves memory for training)
			 */
			Collection<NamedEntity> covered = idx.get(sentence);
			if (covered == null || covered.isEmpty()) {
				continue;
			}

			// LinkedHashMap keeps the rows in token order.
			Map<Token, Row> ctokens = new LinkedHashMap<>();

			// Tokens
			List<Token> tokens = selectCovered(Token.class, sentence);

			for (Token token : tokens) {
				Row row = new Row();
				row.token = token;
				row.ne_val = encoder.encode(token);
				ctokens.put(row.token, row);
			}

			/*
			 * Write sentence in tsv format
			 * One token per line, tag in 2nd column. Sentences separated by
			 * empty line.
			 */
			for (Row row : ctokens.values()) {
				String label = row.ne_val;

				// write tab separated data
				aOut.printf("%s\t%s\n", row.token.getCoveredText(), label);
			}

			aOut.println();
		}
	}

	/**
	 * One output line: a token together with the label the encoder produced
	 * for it.
	 */
	private static final class Row {
		/** The token whose covered text goes into the first column. */
		Token token;
		/** The encoded label that goes into the second column. */
		String ne_val;
	}
}