// de.citec.scie.Main
/*
 * SCIE -- Spinal Cord Injury Information Extraction
 * Copyright (C) 2013, 2014
 * Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.citec.scie;

import de.citec.scie.descriptors.Error;
import de.citec.scie.ner.db.generic.DatabaseSingleton;
import de.citec.scie.ner.db.mapdb.MapDBDatabase;
import de.citec.scie.typesystem.Typesystem;
import static de.citec.scie.Constants.PROJECT_NAME;
import static de.citec.scie.Constants.TYPESYSTEM;
import java.io.File;
import java.io.FileFilter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.UIMAException;
import org.apache.uima.fit.util.CasIOUtil;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;

/**
 * This is the command line interface for the SCIE project. It can be called in
 * two basic modes:
 * <ol>
 * <li>Annotation, which takes a .pdf file (or multiple .pdf files) as input
 * and returns the annotations as XCAS output</li>
 * <li>Training, which takes two directories (input and output). It expects
 * tuples of .rel and XCAS files in the input folder and writes the trained
 * annotator models to the output. Optionally you can specify even more details
 * about the training.</li>
 * </ol>
 * Further modes ({@code --daemon} and {@code --rankingTrain}) are dispatched by
 * {@link #main(String[])}.
 *
 * @author Benjamin Paassen - [email protected]
 */
public class Main {

	/**
	 * Matches a single command token followed by optional whitespace. A token
	 * is either a run of non-whitespace characters that does not start with a
	 * double quote, or a double-quoted string (which may contain whitespace).
	 * Compiled once because the daemon loop tokenizes every incoming command.
	 */
	private static final Pattern ARGUMENT_PATTERN
			= Pattern.compile("([^\"]\\S*|\".+?\")\\s*");

	/**
	 * Splits a command string into its tokens. Tokens are separated by
	 * whitespace; a token enclosed in double quotes may itself contain
	 * whitespace.
	 *
	 * @param str the command string to tokenize
	 * @return the tokens in order of appearance, with every double quote
	 * character removed; empty if no token was found
	 */
	private static List<String> parseArgumentString(String str) {
		final List<String> tokens = new ArrayList<>();
		final Matcher matcher = ARGUMENT_PATTERN.matcher(str);
		while (matcher.find()) {
			// Quotes only delimit tokens, they are not part of the value
			tokens.add(matcher.group(1).replace("\"", ""));
		}
		return tokens;
	}

	/**
	 * Reads a single line from the given stream, byte by byte, without any
	 * read-ahead buffering. The line ends at the first {@code '\n'}, which is
	 * not part of the result. A trailing {@code '\r'} is NOT removed.
	 *
	 * Each byte is converted to one char by a plain cast, so only single-byte
	 * (ISO-8859-1 style) input is decoded faithfully.
	 *
	 * @param is the stream to read from
	 * @return the line without its terminating newline, or {@code null} if the
	 * end of the stream was reached before any byte could be read
	 * @throws IOException if reading from the stream fails
	 */
	private static String readLine(InputStream is) throws IOException {
		final StringBuilder line = new StringBuilder();
		int data;
		while ((data = is.read()) != -1) {
			if (data == '\n') {
				return line.toString();
			}
			line.append((char) data);
		}
		// End of stream: still hand out a last line that was not terminated by
		// a newline instead of silently dropping it.
		return line.length() > 0 ? line.toString() : null;
	}

	/**
	 * Command line entry point. The first argument selects the mode:
	 * <ul>
	 * <li>{@code --annotation [--force|-f] db_file input_path [output_path]}</li>
	 * <li>{@code --daemon db_file}</li>
	 * <li>{@code --training [--force|-f] input_path classifier_spec [output_path]}</li>
	 * <li>{@code --rankingTrain input_path}</li>
	 * </ul>
	 * Without arguments, or with an unknown mode, the help text is printed.
	 *
	 * @param args the command line arguments
	 * @throws IOException if an I/O error occurs in the selected mode
	 * @throws UnsupportedOperationException if required arguments are missing
	 * or refer to paths of the wrong kind
	 */
	public static void main(String[] args) throws IOException {
		if (args.length < 1) {
			printHelp();
			return;
		}
		try {
			switch (args[0]) {
				case "--annotation": {
					int i = 1;

					// Read the force tag
					final boolean force = isForceFlag(args, i);
					if (force) {
						i++;
					}

					// The database file and the input path are mandatory; check
					// this AFTER the optional force flag has been consumed.
					if (args.length < i + 2) {
						throw new UnsupportedOperationException("Expected a database file and an input path as second argument!");
					}
					openDatabase(args[i++]);

					// Read the input files
					final File input = new File(args[i++]);
					final PDFFilter pdfFilter = new PDFFilter();
					final File[] pdfs;
					if (input.isFile()) {
						if (!pdfFilter.accept(input)) {
							throw new UnsupportedOperationException("The given file is no PDF: " + input.getAbsolutePath());
						}
						pdfs = new File[]{input};
					} else {
						pdfs = input.listFiles(pdfFilter);
						if (pdfs == null || pdfs.length == 0) {
							throw new UnsupportedOperationException("The given directory contained no PDFs: " + input.getAbsolutePath());
						}
					}

					// Read the output directory and start the annotation process
					final File outputFolder = resolveOutputFolder(args, i);
					annotate(pdfs, outputFolder, force);
					break;
				}
				case "--daemon": {
					if (args.length < 2) {
						throw new UnsupportedOperationException("Expected database file as second argument!");
					}
					openDatabase(args[1]);

					// Run the interactive daemon
					runDaemon();
					break;
				}
				case "--training": {
					int j = 1;
					final boolean force = isForceFlag(args, j);
					if (force) {
						j++;
					}
					if (args.length < j + 1) {
						throw new UnsupportedOperationException("Expected an input path as second argument!");
					}
					final File input = new File(args[j++]);
					if (!input.isDirectory()) {
						throw new UnsupportedOperationException("Expecting a directory as input!");
					}
					final File[] relFiles = input.listFiles(new RelFilter());
					if (relFiles == null || relFiles.length == 0) {
						throw new UnsupportedOperationException("The given directory contained no .rel-files: " + input.getAbsolutePath());
					}
					if (args.length < j + 1) {
						throw new UnsupportedOperationException("You did not specify (a) classifier(s) to train!");
					}
					final String classifierSpec = args[j++];
					final File outputFolder = resolveOutputFolder(args, j);
					Training.train(relFiles, classifierSpec, outputFolder, force);
					break;
				}
				case "--rankingTrain": {
					if (args.length < 2) {
						throw new UnsupportedOperationException("Expected an input path as second argument!");
					}
					final File input = new File(args[1]);
					// Test for existence first; isDirectory() is false for a
					// missing path and would hide the more specific message.
					if (!input.exists()) {
						throw new UnsupportedOperationException("No directory exists at given input path: " + input.getAbsolutePath());
					}
					if (!input.isDirectory()) {
						throw new UnsupportedOperationException("Expecting a directory as input!");
					}
					rankingTrain(input);
					break;
				}
				default:
					printHelp();
					break;
			}
		} finally {
			DatabaseSingleton.uninitialize();
		}
		// NOTE(review): explicit exit kept from the original; presumably it
		// stops threads that would otherwise keep the JVM alive -- confirm.
		System.exit(0);
	}

	/**
	 * Tells whether the argument at the given index is the force flag.
	 *
	 * @param args the command line arguments
	 * @param index the index to inspect; may lie beyond the end of the array
	 * @return {@code true} if the argument exists and is {@code --force} or
	 * {@code -f}
	 */
	private static boolean isForceFlag(String[] args, int index) {
		return args.length > index
				&& (args[index].equals("--force") || args[index].equals("-f"));
	}

	/**
	 * Initializes the {@link DatabaseSingleton} with the database stored in the
	 * given file. According to the original inline comment the flags passed to
	 * {@link MapDBDatabase} open the database read only.
	 *
	 * @param path path of the database file
	 * @throws IOException declared so that I/O failures of the database layer
	 * propagate to the caller unchanged
	 * @throws UnsupportedOperationException if the path is not an existing file
	 */
	private static void openDatabase(String path) throws IOException {
		final File dbFile = new File(path);
		if (!dbFile.isFile()) {
			throw new UnsupportedOperationException("Database file " + dbFile.getPath() + " does not exist!");
		}
		DatabaseSingleton.initialize(new MapDBDatabase(dbFile, true, false));
	}

	/**
	 * Determines the output folder from the optional argument at the given
	 * index. If the argument is missing the current working directory is used;
	 * if the folder does not exist yet it is created.
	 *
	 * @param args the command line arguments
	 * @param index index of the optional output path argument
	 * @return the output folder
	 * @throws UnsupportedOperationException if the path is not a directory
	 * (including the case that it could not be created)
	 */
	private static File resolveOutputFolder(String[] args, int index) {
		if (args.length < index + 1) {
			final File workingDirectory = new File(".");
			System.out.println("No output directory was given. Using " + workingDirectory.getAbsolutePath() + " per default.");
			return workingDirectory;
		}
		final File outputFolder = new File(args[index]);
		if (!outputFolder.exists()) {
			outputFolder.mkdirs();
			System.out.println("Creating directory: " + outputFolder.getAbsolutePath());
		}
		if (!outputFolder.isDirectory()) {
			throw new UnsupportedOperationException("The given output path is no directory: " + outputFolder.getAbsolutePath());
		}
		return outputFolder;
	}

	/**
	 * File filter that accepts every path whose name ends in ".pdf", ignoring
	 * upper and lower case. It does not check whether the path is a regular
	 * file.
	 */
	private static final class PDFFilter implements FileFilter {

		/** The accepted file name ending, in lower case. */
		private static final String SUFFIX = ".pdf";

		@Override
		public boolean accept(File file) {
			final String lowerCaseName = file.getName().toLowerCase();
			return lowerCaseName.endsWith(SUFFIX);
		}

	}

	/**
	 * File filter that accepts every path whose name ends in ".rel", ignoring
	 * upper and lower case. It does not check whether the path is a regular
	 * file.
	 */
	private static final class RelFilter implements FileFilter {

		/** The accepted file name ending, in lower case. */
		private static final String SUFFIX = ".rel";

		@Override
		public boolean accept(File file) {
			final String lowerCaseName = file.getName().toLowerCase();
			return lowerCaseName.endsWith(SUFFIX);
		}

	}

	/**
	 * Runs the interactive daemon: reads commands line by line from standard
	 * input until the end of the stream and executes every line that starts
	 * with {@code EXEC}. Other lines are ignored.
	 *
	 * An EXEC command consists of exactly five tokens:
	 * {@code EXEC (PDF|TXT) outputToken inputFile outputFile}. For a valid
	 * command a {@code START ...} line is printed to standard output, the input
	 * file is annotated and written to the output file as XCAS, and finally
	 * either {@code DONE outputToken} or {@code ERR outputToken} is printed.
	 * Malformed commands are reported on standard error and skipped.
	 *
	 * @throws IOException if reading from standard input fails
	 */
	private static void runDaemon() throws IOException {
		final Annotator annotator = new Annotator();
		String cmd;
		while ((cmd = readLine(System.in)) != null) {
			if (!cmd.startsWith("EXEC")) {
				continue;
			}

			final List<String> cmds = parseArgumentString(cmd);
			if (cmds.size() != 5) {
				System.err.println("Error: Invalid command token count");
				continue;
			}
			final String type = cmds.get(1);
			final boolean isPdf = type.equals("PDF");
			if (!isPdf && !type.equals("TXT")) {
				System.err.println("Error: Inalid input type " + type);
				continue;
			}
			final String outputToken = cmds.get(2);
			final File inFile = new File(cmds.get(3));
			final File outFile = new File(cmds.get(4));

			// Reprint the parameters to stdout
			System.out.println("START " + type + " " + outputToken
					+ " \"" + inFile.getPath() + "\" "
					+ " \"" + outFile.getPath() + "\" ");

			// Perform the actual annotation
			try {
				CasIOUtil.writeXCas(annotator.annotateInputFile(inFile, isPdf), outFile);
				System.out.println("DONE " + outputToken);
			} catch (UIMAException | IOException ex) {
				System.out.println("ERR " + outputToken);
				ex.printStackTrace(System.err);
			}
			System.out.flush();
		}
	}

	/**
	 * Prints the usage information for all command line modes to standard
	 * output.
	 */
	private static void printHelp() {
		// Common prefix of all usage lines
		final String jar = "java -jar " + PROJECT_NAME + ".jar ";

		System.out.println("SCIE -- Spinal Cord Injury Ontology Extraction");
		System.out.println("Copyright (C) 2013, 2014\n");
		System.out.println("Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel");
		System.out.println("");
		System.out.println("You can use this program in four modes:");
		System.out.println("--annotation");
		System.out.println("and");
		System.out.println("--training");
		System.out.println("and");
		System.out.println("--rankingTrain");
		System.out.println("and");
		System.out.println("--daemon");
		System.out.println("You must specify the mode as first command line argument.");
		System.out.println("");
		System.out.println("1.) Annotation Mode");
		System.out.println("");
		System.out.println("Command Line Usage:");
		System.out.println(jar + "--annotation [--force|-f] <db_file> <input_path> [output_path]");
		System.out.println("where <db_file> leads to the database file and "
				+ "<input_path> either leads to a single PDF file "
				+ "or to a directory containing PDF files");
		System.out.println("Additionally you may set the 'force' flag such that "
				+ "an output file that is already present is overridden.");
		System.out.println("Additionally you may specify an output folder where "
				+ "the output is stored. Per default the current working "
				+ "directory is used.");
		System.out.println("");
		System.out.println("In Annotation mode the program will annotate the "
				+ "given PDF files and create an XCAS file containing "
				+ "annotations for each one.");
		System.out.println("");
		System.out.println("2.) Training Mode");
		System.out.println("");
		System.out.println("Command Line Usage:");
		System.out.println(jar + "--training [--force|-f] <input_path> <classifier_spec> [output_path]");
		System.out.println("where <input_path> leads to a directory containing "
				+ "tuples of .rel files and XCAS files.");
		System.out.println("<classifier_spec> is a regular expression specifying"
				+ " the classifier(s) you want to train.");
		System.out.println("Additionally you may set the 'force' flag such that "
				+ "an output file that is already present is overridden.");
		System.out.println("Additionally you may specify an output folder where "
				+ "the output is stored. Per default the current working "
				+ "directory is used.");
		System.out.println("");
		System.out.println("In Training Mode the program will take the given "
				+ "tuples of .rel and XCAS file as training data and will "
				+ "retrain all classifiers using the given data. The training "
				+ "results are stored in 3 files for each classifier:");
		System.out.println("* a .model file containing the LibLinear parameters");
		System.out.println("* a .features file specifying the non-zero features");
		System.out.println("* an .eval file containing the evaluation results for the training of this classifier.");
		System.out.println("Note that we currently do not support setting "
				+ "training parameters using the command line. Unfortunately you "
				+ "still need to change the source code (in the Main class) "
				+ "to do that.");
		System.out.println("");
		System.out.println("3.) Ranking for Training Mode");
		System.out.println("");
		System.out.println("Command Line Usage:");
		System.out.println(jar + "--rankingTrain <input_path>");
		System.out.println("where <input_path> leads to a directory containing "
				+ "XCAS files.");
		System.out.println("");
		System.out.println("In Ranking for Training Mode the program will take the given "
				+ "XCAS files and calculate which one should be annotated by hand and trained to the program to improve "
				+ "the program.");
		System.out.println("");
		System.out.println("4.) Daemon Mode");
		System.out.println("");
		System.out.println("Command Line Usage:");
		System.out.println(jar + "--daemon <db_file>");
		System.out.println("where <db_file> leads to the database file.");
		System.out.println("");
		System.out.println("In Daemon Mode the program reads commands of the form");
		System.out.println("EXEC <PDF|TXT> <output_token> <input_file> <output_file>");
		System.out.println("from standard input, one per line, and writes the "
				+ "annotations of <input_file> as XCAS to <output_file>.");
	}

	/**
	 * Annotates each of the given PDF files and writes the result as XCAS to a
	 * file in the output folder. The output file carries the name of the PDF
	 * with the extension replaced by ".xml". Files that cannot be processed are
	 * reported on standard error and skipped.
	 *
	 * @param pdfs the PDF files to annotate
	 * @param outputFolder the folder the XCAS files are written to
	 * @param force if {@code true} an already existing output file is deleted
	 * and written anew, otherwise the corresponding PDF is skipped
	 */
	private static void annotate(final File[] pdfs, final File outputFolder, final boolean force) {
		final Annotator annotator = new Annotator();
		for (final File pdf : pdfs) {
			System.out.println("Processing file " + pdf.getAbsolutePath());
			try {
				// Make sure the output file is not overridden unless this is
				// explicitly wanted
				final File outputFile = new File(outputFolder, toXmlFileName(pdf.getName()));
				if (outputFile.exists()) {
					if (!force) {
						System.out.println("Ignoring file because output file " + outputFile.getAbsolutePath() + " already exists!");
						continue;
					}
					outputFile.delete();
				}

				// Perform the actual annotation
				CasIOUtil.writeXCas(annotator.annotateInputFile(pdf, true), outputFile);
			} catch (UIMAException | IOException ex) {
				System.err.println("The file could not be processed because of "
						+ "an exception during parsing:");
				ex.printStackTrace(System.err);
			}
		}
	}

	/**
	 * Derives the output file name for a PDF file name. A trailing ".pdf" is
	 * matched case-insensitively, because the input filter accepts names such
	 * as "X.PDF" too; a case-sensitive replacement would leave such a name
	 * unchanged, so the output file could be the input PDF itself and be
	 * deleted in force mode.
	 *
	 * @param pdfName the name of the input file
	 * @return the name with a trailing ".pdf" (any case) replaced by ".xml", or
	 * with ".xml" appended if there is no such ending
	 */
	private static String toXmlFileName(String pdfName) {
		final String suffix = ".pdf";
		final int stemLength = pdfName.length() - suffix.length();
		if (stemLength >= 0
				&& pdfName.regionMatches(true, stemLength, suffix, 0, suffix.length())) {
			return pdfName.substring(0, stemLength) + ".xml";
		}
		return pdfName + ".xml";
	}

	/**
	 * Ranks the XCAS files (*.xml) of the given directory by the errors
	 * recorded in them and prints the ranking to standard output.
	 *
	 * For every file the {@link Error} annotations are read; if there are
	 * several, the values of the last one are used. Files without any recorded
	 * database or relation error are left out. The remaining files are sorted
	 * in ascending order by the number of ';'-separated database entries and,
	 * on ties, by the number of relation entries. Files that cannot be read
	 * are reported on standard error and skipped.
	 *
	 * @param input the directory containing the XCAS files
	 */
	private static void rankingTrain(File input) {
		final FilenameFilter xmlFilter = new FilenameFilter() {
			@Override
			public boolean accept(File dir, String name) {
				return name.toLowerCase().endsWith(".xml");
			}
		};
		final File[] xcasFiles = input.listFiles(xmlFilter);
		if (xcasFiles == null) {
			// listFiles yields null if the path is no directory or cannot be read
			System.err.println("The directory " + input.getAbsolutePath()
					+ " could not be read.");
			return;
		}

		// Each entry: {file name, database errors, relation errors}
		final List<String[]> ranking = new ArrayList<>();
		for (File xcas : xcasFiles) {
			try {
				final JCas jcas = Typesystem.getJCas(TYPESYSTEM);
				CasIOUtil.readXCas(jcas, xcas);
				// Default to "no errors" so that files without an Error
				// annotation do not cause a NullPointerException below
				String databases = "";
				String relations = "";
				final Collection<Error> errors = JCasUtil.select(jcas, Error.class);
				for (Error error : errors) {
					databases = emptyIfNull(error.getDatabases());
					relations = emptyIfNull(error.getRelations());
				}
				if (!databases.isEmpty() || !relations.isEmpty()) {
					ranking.add(new String[]{xcas.getName(), databases, relations});
				}
			} catch (IOException ex) {
				System.err.println("The file " + xcas.getName()
						+ " could not be processed because of an exception during parsing:");
				ex.printStackTrace(System.err);
			}
		}

		Collections.sort(ranking, new Comparator<String[]>() {
			@Override
			public int compare(String[] error1, String[] error2) {
				final int byDatabases = Integer.compare(count(error1[1]), count(error2[1]));
				if (byDatabases != 0) {
					return byDatabases;
				}
				return Integer.compare(count(error1[2]), count(error2[2]));
			}

			/** Number of ';'-separated entries; an empty string has none. */
			private int count(String type) {
				// "".split(";") has length 1, so the empty case is handled apart
				return type.isEmpty() ? 0 : type.split(";").length;
			}
		});

		System.out.println("Ranking of Files:");
		for (String[] out : ranking) {
			System.out.println(out[0]);
			if (!out[1].equals("")) {
				System.out.println("Update following databases: " + getErrors(out[1]));
			}
			if (!out[2].equals("")) {
				System.out.println("Annotate and train, because of missing relations: " + getErrors(out[2]));
			}
		}
	}

	/**
	 * Maps {@code null} to the empty string.
	 *
	 * @param value the value to normalize, may be {@code null}
	 * @return the value itself, or "" if it is {@code null}
	 */
	private static String emptyIfNull(String value) {
		return value == null ? "" : value;
	}

	/**
	 * Converts a ';'-separated error list into a human readable, comma
	 * separated list. A single trailing ';' is dropped if present.
	 *
	 * @param type the ';'-separated list, e.g. "a;b;"; may be empty or
	 * {@code null}
	 * @return the entries joined by ", ", e.g. "a, b"; the empty string for
	 * empty or {@code null} input
	 */
	private static String getErrors(String type) {
		if (type == null || type.isEmpty()) {
			return "";
		}
		// Only cut off the last character if it really is a separator
		final String entries = type.endsWith(";")
				? type.substring(0, type.length() - 1)
				: type;
		return entries.replace(";", ", ");
	}
}