de.citec.scie.Main Maven / Gradle / Ivy
/*
* SCIE -- Spinal Cord Injury Information Extraction
* Copyright (C) 2013, 2014
* Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package de.citec.scie;
import de.citec.scie.descriptors.Error;
import de.citec.scie.ner.db.generic.DatabaseSingleton;
import de.citec.scie.ner.db.mapdb.MapDBDatabase;
import de.citec.scie.typesystem.Typesystem;
import static de.citec.scie.Constants.PROJECT_NAME;
import static de.citec.scie.Constants.TYPESYSTEM;
import java.io.File;
import java.io.FileFilter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.UIMAException;
import org.apache.uima.fit.util.CasIOUtil;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
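/**
* This is the command line interface for the SCIE project. The mode is
* selected by the first command line argument:
* - Annotation (--annotation), which takes a database file and a .pdf file
* (or a directory containing .pdf files) as input and writes the annotations
* as XCAS output
* - Daemon (--daemon), which takes a database file and then processes
* annotation commands that are read line by line from standard input
* - Training (--training), which takes an input directory, a classifier
* specification and optionally an output directory. It expects tuples of .rel
* and XCAS files in the input folder and writes the trained annotator models
* to the output. The classifier specification selects which classifiers are
* trained.
* - Ranking for training (--rankingTrain), which takes a directory of XCAS
* files and prints a ranking of the files that should be annotated by hand
*
* @author Benjamin Paassen - [email protected]
*/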
public class Main {
/**
 * Matches one command token followed by optional whitespace. A token is
 * either a character other than a double quote followed by a run of
 * non-whitespace characters, or a double-quoted string (which may contain
 * whitespace). Compiled once because a command line is parsed for every
 * daemon request.
 */
private static final Pattern ARGUMENT_PATTERN
        = Pattern.compile("([^\"]\\S*|\".+?\")\\s*");

/**
 * Splits a command string into its tokens. Tokens are separated by
 * whitespace; a token may be enclosed in double quotes in order to contain
 * whitespace itself. All double quote characters are removed from the
 * returned tokens.
 *
 * @param str the command string to split.
 * @return the tokens in the order in which they occur in the input. The list
 * is empty if the input contains no token.
 */
private static List<String> parseArgumentString(String str) {
    final List<String> tokens = new ArrayList<>();
    final Matcher matcher = ARGUMENT_PATTERN.matcher(str);
    while (matcher.find()) {
        // Quotes only delimit a token, they are not part of its value.
        tokens.add(matcher.group(1).replace("\"", ""));
    }
    return tokens;
}
/**
 * Reads a single line from the given stream. The stream is read byte by
 * byte, so nothing beyond the terminating newline is consumed. Each byte is
 * widened to a char directly, i.e. no multi-byte character decoding takes
 * place.
 *
 * @param is the stream to read from.
 * @return the line without its terminating '\n'. A last line that is not
 * terminated by a newline is returned as well. Returns null if the end of
 * the stream is reached and no further data was read.
 * @throws IOException if reading from the stream fails.
 */
private static String readLine(InputStream is) throws IOException {
    final StringBuilder line = new StringBuilder();
    int data;
    while ((data = is.read()) != -1) {
        if (data == '\n') {
            return line.toString();
        }
        line.append((char) data);
    }
    // End of stream: do not drop a final line that lacks the trailing
    // newline, only signal the end if there really was no more data.
    return line.length() > 0 ? line.toString() : null;
}
/**
 * Entry point of the command line interface. The first argument selects the
 * mode (--annotation, --daemon, --training or --rankingTrain); without
 * arguments or with an unknown mode the help text is printed.
 *
 * Invalid arguments are reported by throwing an
 * UnsupportedOperationException. The database singleton is uninitialized in
 * any case before the method is left, and the JVM is terminated explicitly
 * once a mode has finished normally.
 *
 * @param args the command line arguments.
 * @throws IOException if the selected mode fails with an I/O error.
 */
public static void main(String[] args) throws IOException {
    if (args.length < 1) {
        printHelp();
        return;
    }
    try {
        switch (args[0]) {
            case "--annotation":
                runAnnotationMode(args);
                break;
            case "--daemon":
                runDaemonMode(args);
                break;
            case "--training":
                runTrainingMode(args);
                break;
            case "--rankingTrain":
                runRankingTrainMode(args);
                break;
            default:
                printHelp();
                break;
        }
    } finally {
        DatabaseSingleton.uninitialize();
    }
    System.exit(0);
}

/** Returns true if the given argument is the (long or short) force flag. */
private static boolean isForceFlag(String arg) {
    return arg.equals("--force") || arg.equals("-f");
}

/**
 * Checks that the database file exists and initializes the database
 * singleton with it. The constructor flags are the ones the original code
 * documents as opening the database read only.
 */
private static void openDatabase(String path) throws IOException {
    final File dbFile = new File(path);
    if (!dbFile.isFile()) {
        throw new UnsupportedOperationException("Database file " + dbFile.getPath() + " does not exist!");
    }
    DatabaseSingleton.initialize(new MapDBDatabase(dbFile, true, false));
}

/**
 * Determines the output folder from args[index]. Falls back to the current
 * working directory if that argument is missing and creates the folder if it
 * does not exist yet.
 */
private static File resolveOutputFolder(String[] args, int index) {
    if (args.length < index + 1) {
        final File workingDirectory = new File(".");
        System.out.println("No output directory was given. Using " + workingDirectory.getAbsolutePath() + " per default.");
        return workingDirectory;
    }
    final File outputFolder = new File(args[index]);
    // Create the folder first: a missing path is never a directory, so
    // testing isDirectory() beforehand would reject every new folder.
    if (!outputFolder.exists()) {
        outputFolder.mkdirs();
        System.out.println("Creating directory: " + outputFolder.getAbsolutePath());
    }
    if (!outputFolder.isDirectory()) {
        throw new UnsupportedOperationException("The given output path is no directory: " + outputFolder.getAbsolutePath());
    }
    return outputFolder;
}

/** Handles "--annotation [--force|-f] database input [output]". */
private static void runAnnotationMode(String[] args) throws IOException {
    final String missingArguments = "Expected a database file and an input path as second argument!";
    if (args.length < 3) {
        throw new UnsupportedOperationException(missingArguments);
    }
    int i = 1;
    // Read the force tag
    final boolean force = isForceFlag(args[i]);
    if (force) {
        i++;
    }
    // The force flag shifts the mandatory arguments by one position.
    if (args.length < i + 2) {
        throw new UnsupportedOperationException(missingArguments);
    }
    openDatabase(args[i++]);
    // Read the input files
    final File input = new File(args[i++]);
    final PDFFilter pdfFilter = new PDFFilter();
    final File[] pdfs;
    if (input.isFile()) {
        if (!pdfFilter.accept(input)) {
            throw new UnsupportedOperationException("The given file is no PDF: " + input.getAbsolutePath());
        }
        pdfs = new File[]{input};
    } else {
        pdfs = input.listFiles(pdfFilter);
        if (pdfs == null || pdfs.length == 0) {
            throw new UnsupportedOperationException("The given directory contained no PDFs: " + input.getAbsolutePath());
        }
    }
    final File outputFolder = resolveOutputFolder(args, i);
    // Start the annotation process
    annotate(pdfs, outputFolder, force);
}

/** Handles "--daemon database". */
private static void runDaemonMode(String[] args) throws IOException {
    if (args.length < 2) {
        throw new UnsupportedOperationException("Expected database file as second argument!");
    }
    openDatabase(args[1]);
    // Run the interactive daemon
    runDaemon();
}

/** Handles "--training [--force|-f] input classifiers [output]". */
private static void runTrainingMode(String[] args) throws IOException {
    final String missingInput = "Expected an input path as second argument!";
    if (args.length < 2) {
        throw new UnsupportedOperationException(missingInput);
    }
    int j = 1;
    final boolean force = isForceFlag(args[j]);
    if (force) {
        j++;
    }
    // The force flag shifts the mandatory arguments by one position.
    if (args.length < j + 1) {
        throw new UnsupportedOperationException(missingInput);
    }
    final File input = new File(args[j++]);
    if (!input.isDirectory()) {
        throw new UnsupportedOperationException("Expecting a directory as input!");
    }
    final File[] relFiles = input.listFiles(new RelFilter());
    if (relFiles == null || relFiles.length == 0) {
        throw new UnsupportedOperationException("The given directory contained no .rel-files: " + input.getAbsolutePath());
    }
    if (args.length < j + 1) {
        throw new UnsupportedOperationException("You did not specify (a) classifier(s) to train!");
    }
    final String classifierSpec = args[j++];
    final File outputFolder = resolveOutputFolder(args, j);
    Training.train(relFiles, classifierSpec, outputFolder, force);
}

/** Handles "--rankingTrain input". */
private static void runRankingTrainMode(String[] args) {
    if (args.length < 2) {
        throw new UnsupportedOperationException("Expected an input path as second argument!");
    }
    final File input = new File(args[1]);
    // Test for existence first, otherwise this message can never be shown
    // (a missing path is never a directory).
    if (!input.exists()) {
        throw new UnsupportedOperationException("No directory exists at given input path: " + input.getAbsolutePath());
    }
    if (!input.isDirectory()) {
        throw new UnsupportedOperationException("Expecting a directory as input!");
    }
    rankingTrain(input);
}
/**
 * File filter that accepts every file whose name ends with ".pdf", ignoring
 * the case of the name. Only the name is inspected.
 */
private static final class PDFFilter implements FileFilter {

    /** Lower case name suffix an accepted file must have. */
    private static final String SUFFIX = ".pdf";

    @Override
    public boolean accept(File file) {
        final String lowerCaseName = file.getName().toLowerCase();
        return lowerCaseName.endsWith(SUFFIX);
    }
}
/**
 * File filter that accepts every file whose name ends with ".rel", ignoring
 * the case of the name. Only the name is inspected.
 */
private static final class RelFilter implements FileFilter {

    /** Lower case name suffix an accepted file must have. */
    private static final String SUFFIX = ".rel";

    @Override
    public boolean accept(File file) {
        final String lowerCaseName = file.getName().toLowerCase();
        return lowerCaseName.endsWith(SUFFIX);
    }
}
/**
 * Runs the interactive daemon: commands are read line by line from standard
 * input until the end of the stream is reached. Lines that do not start with
 * "EXEC" are ignored. A command consists of exactly five tokens:
 *
 * EXEC type token inputFile outputFile
 *
 * where type is either PDF or TXT. For a valid command the parameters are
 * echoed in a "START" line on standard output, the input file is annotated
 * and written as XCAS to the output file, and either "DONE token" or
 * "ERR token" is printed on standard output. Malformed commands are reported
 * on standard error only.
 *
 * @throws IOException if reading from standard input fails.
 */
private static void runDaemon() throws IOException {
    final Annotator annotator = new Annotator();
    String cmd;
    while ((cmd = readLine(System.in)) != null) {
        if (!cmd.startsWith("EXEC")) {
            continue;
        }
        final List<String> cmds = parseArgumentString(cmd);
        if (cmds.size() != 5) {
            System.err.println("Error: Invalid command token count");
            continue;
        }
        final String type = cmds.get(1);
        final boolean isPdf = type.equals("PDF");
        if (!isPdf && !type.equals("TXT")) {
            System.err.println("Error: Invalid input type " + type);
            continue;
        }
        final String outputToken = cmds.get(2);
        final File inFile = new File(cmds.get(3));
        final File outFile = new File(cmds.get(4));
        // Reprint the parameters to stdout
        System.out.println("START " + type + " " + outputToken
                + " \"" + inFile.getPath() + "\" "
                + " \"" + outFile.getPath() + "\" ");
        // Perform the actual annotation; a failure only affects this
        // command, the daemon keeps running.
        try {
            CasIOUtil.writeXCas(annotator.annotateInputFile(inFile, isPdf), outFile);
            System.out.println("DONE " + outputToken);
        } catch (UIMAException | IOException ex) {
            System.out.println("ERR " + outputToken);
            ex.printStackTrace(System.err);
        }
        System.out.flush();
    }
}
/**
 * Prints the usage information for all modes of the command line interface
 * to standard output.
 */
private static void printHelp() {
    // Common prefix of all usage lines.
    final String call = "java -jar " + PROJECT_NAME + ".jar ";
    System.out.println("SCIE -- Spinal Cord Injury Ontology Extraction");
    System.out.println("Copyright (C) 2013, 2014\n");
    System.out.println("Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel");
    System.out.println("");
    System.out.println("You can use this program in four modes:");
    System.out.println("--annotation");
    System.out.println("and");
    System.out.println("--training");
    System.out.println("and");
    System.out.println("--rankingTrain");
    System.out.println("and");
    System.out.println("--daemon");
    System.out.println("You must specify the mode as first command line argument.");
    System.out.println("");
    System.out.println("1.) Annotation Mode");
    System.out.println("");
    System.out.println("Command Line Usage:");
    System.out.println(call + "--annotation [--force|-f] <database_file> <input_path> [output_path]");
    System.out.println("where <database_file> leads to the database file and "
            + "<input_path> either leads to a single PDF file "
            + "or to a directory containing PDF files");
    System.out.println("Additionally you may set the 'force' flag such that "
            + "an output file that is already present is overridden.");
    System.out.println("Additionally you may specify an output folder where "
            + "the output is stored. Per default the current working "
            + "directory is used.");
    System.out.println("");
    System.out.println("In Annotation mode the program will annotate the "
            + "given PDF files and create an XCAS file containing "
            + "annotations for each one.");
    System.out.println("");
    System.out.println("2.) Training Mode");
    System.out.println("");
    System.out.println("Command Line Usage:");
    System.out.println(call + "--training [--force|-f] <input_path> <classifier_regex> [output_path]");
    System.out.println("where <input_path> leads to a directory containing "
            + "tuples of .rel files and XCAS files.");
    System.out.println("<classifier_regex> is a regular expression specifying"
            + " the classifier(s) you want to train.");
    System.out.println("Additionally you may set the 'force' flag such that "
            + "an output file that is already present is overridden.");
    System.out.println("Additionally you may specify an output folder where "
            + "the output is stored. Per default the current working "
            + "directory is used.");
    System.out.println("");
    System.out.println("In Training Mode the program will take the given "
            + "tuples of .rel and XCAS file as training data and will "
            + "retrain all classifiers using the given data. The training "
            + "results are stored in 3 files for each classifier:");
    System.out.println("* a .model file containing the LibLinear parameters");
    System.out.println("* a .features file specifying the non-zero features");
    System.out.println("* an .eval file containing the evaluation results for the training of this classifier.");
    System.out.println("Note that we currently do not support setting "
            + "training parameters using the command line. Unfortunately you "
            + "still need to change the source code (in the Main class) "
            + "to do that.");
    System.out.println("");
    System.out.println("3.) Ranking for Training Mode");
    System.out.println("");
    System.out.println("Command Line Usage:");
    System.out.println(call + "--rankingTrain <input_path>");
    System.out.println("where <input_path> leads to a directory containing "
            + "XCAS files.");
    System.out.println("");
    System.out.println("In Ranking for Training Mode the program will take the given "
            + "XCAS files and calculate which ones should be annotated by hand "
            + "and used for training in order to improve the program.");
    System.out.println("");
    System.out.println("4.) Daemon Mode");
    System.out.println("");
    System.out.println("Command Line Usage:");
    System.out.println(call + "--daemon <database_file>");
    System.out.println("where <database_file> leads to the database file.");
    System.out.println("");
    System.out.println("In Daemon Mode the program reads commands from the "
            + "standard input, one command per line, of the form");
    System.out.println("EXEC <PDF|TXT> <token> <input_file> <output_file>");
    System.out.println("and writes the annotations for <input_file> as XCAS "
            + "to <output_file>. Arguments containing whitespace must be "
            + "enclosed in double quotes.");
    System.out.println("For each command a line starting with START is "
            + "printed to the standard output, followed by 'DONE <token>' "
            + "on success or 'ERR <token>' on failure.");
}
/**
 * Annotates the given PDF files and writes one XCAS file per PDF into the
 * output folder. The output file carries the name of the PDF with the
 * extension replaced by ".xml". A file whose processing fails is reported on
 * standard error and skipped.
 *
 * @param pdfs the PDF files to annotate.
 * @param outputFolder the folder the XCAS files are written to.
 * @param force if true an already existing output file is deleted and
 * written anew, otherwise the respective PDF is skipped.
 */
private static void annotate(final File[] pdfs, final File outputFolder, final boolean force) {
    final Annotator annotator = new Annotator();
    for (final File pdf : pdfs) {
        System.out.println("Processing file " + pdf.getAbsolutePath());
        try {
            // Make sure the output file is not overridden unless this is
            // explicitly wanted
            final File outputFile = new File(outputFolder, xcasNameFor(pdf.getName()));
            if (outputFile.exists()) {
                if (!force) {
                    System.out.println("Ignoring file because output file " + outputFile.getAbsolutePath() + " already exists!");
                    continue;
                }
                outputFile.delete();
            }
            // Perform the actual annotation
            CasIOUtil.writeXCas(annotator.annotateInputFile(pdf, true), outputFile);
        } catch (UIMAException | IOException ex) {
            System.err.println("The file could not be processed because of "
                    + "an exception during parsing:");
            ex.printStackTrace(System.err);
        }
    }
}

/**
 * Derives the name of the XCAS output file from the name of a PDF file. Only
 * a trailing ".pdf" is replaced and its case is ignored, so that "X.PDF"
 * never maps onto itself (which would let the force option delete the input
 * file). Names without that extension just get ".xml" appended.
 */
private static String xcasNameFor(String pdfName) {
    final String pdfSuffix = ".pdf";
    final int stemLength = pdfName.length() - pdfSuffix.length();
    if (stemLength >= 0
            && pdfName.regionMatches(true, stemLength, pdfSuffix, 0, pdfSuffix.length())) {
        return pdfName.substring(0, stemLength) + ".xml";
    }
    return pdfName + ".xml";
}
/**
 * Reads all XCAS files (*.xml, case insensitive) in the given directory and
 * prints a ranking of those files that carry error information. For every
 * file the database and relation error lists are taken from its Error
 * annotation; both are semicolon separated strings. Files are sorted in
 * ascending order by the number of database errors and, on a tie, by the
 * number of relation errors. Files without any error are not listed.
 *
 * NOTE(review): if a CAS contains several Error annotations the values of
 * the last one win; presumably there is at most one per CAS - confirm.
 *
 * @param input the directory containing the XCAS files.
 */
private static void rankingTrain(File input) {
    final File[] xcasFiles = input.listFiles(new FilenameFilter() {
        @Override
        public boolean accept(File dir, String name) {
            return name.toLowerCase().endsWith(".xml");
        }
    });
    // listFiles returns null if the path is no directory or cannot be read.
    if (xcasFiles == null) {
        System.err.println("The directory could not be read: " + input.getAbsolutePath());
        return;
    }
    // Each entry is { file name, database errors, relation errors }.
    final List<String[]> ranking = new ArrayList<>();
    for (final File xcas : xcasFiles) {
        try {
            final JCas jcas = Typesystem.getJCas(TYPESYSTEM);
            CasIOUtil.readXCas(jcas, xcas);
            // Start with empty lists so that a CAS without Error annotation
            // (or with unset features) does not cause a NullPointerException.
            String databases = "";
            String relations = "";
            final Collection<Error> errors = JCasUtil.select(jcas, Error.class);
            for (Error error : errors) {
                databases = error.getDatabases() == null ? "" : error.getDatabases();
                relations = error.getRelations() == null ? "" : error.getRelations();
            }
            if (!databases.isEmpty() || !relations.isEmpty()) {
                ranking.add(new String[]{xcas.getName(), databases, relations});
            }
        } catch (IOException ex) {
            System.err.println("The file " + xcas.getName()
                    + " could not be processed because of an exception during parsing:");
            ex.printStackTrace(System.err);
        }
    }
    Collections.sort(ranking, new Comparator<String[]>() {
        @Override
        public int compare(String[] error1, String[] error2) {
            final int byDatabases = Integer.compare(count(error1[1]), count(error2[1]));
            if (byDatabases != 0) {
                return byDatabases;
            }
            return Integer.compare(count(error1[2]), count(error2[2]));
        }

        /** Number of entries in a semicolon separated list. */
        private int count(String type) {
            // "".split(";") yields one (empty) element, so the empty list
            // has to be handled explicitly.
            return type.isEmpty() ? 0 : type.split(";").length;
        }
    });
    System.out.println("Ranking of Files:");
    for (String[] out : ranking) {
        System.out.println(out[0]);
        if (!out[1].isEmpty()) {
            System.out.println("Update following databases: " + getErrors(out[1]));
        }
        if (!out[2].isEmpty()) {
            System.out.println("Annotate and train, because of missing relations: " + getErrors(out[2]));
        }
    }
}
/**
 * Converts a semicolon separated error list (e.g. "a;b;") into a human
 * readable, comma separated list (e.g. "a, b").
 *
 * @param type the semicolon separated list; a single trailing semicolon is
 * dropped if present.
 * @return the comma separated list, or the empty string if the input is null
 * or empty.
 */
private static String getErrors(String type) {
    if (type == null || type.isEmpty()) {
        return "";
    }
    // Only strip the last character if it really is the separator, so that
    // an unterminated list does not lose a character of its last entry.
    final String withoutTerminator = type.endsWith(";")
            ? type.substring(0, type.length() - 1)
            : type;
    return withoutTerminator.replace(";", ", ");
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy