All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.citec.scie.Main Maven / Gradle / Ivy

/*
 * SCIE -- Spinal Cord Injury Information Extraction
 * Copyright (C) 2013, 2014
 * Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.citec.scie;

import de.citec.scie.descriptors.Error;
import de.citec.scie.ner.db.generic.DatabaseSingleton;
import de.citec.scie.ner.db.mapdb.MapDBDatabase;
import de.citec.scie.typesystem.Typesystem;
import static de.citec.scie.Constants.PROJECT_NAME;
import static de.citec.scie.Constants.TYPESYSTEM;
import java.io.File;
import java.io.FileFilter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.UIMAException;
import org.apache.uima.fit.util.CasIOUtil;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;

/**
 * This is the command line interface for the SCIE project. It can be called in
 * two basic modes:
 * 
  • Annotation, which takes a .pdf file (or multiple .pdf files) as input * and returns the annotations as XCAS output
  • *
  • Training, which takes two directories (input and output). It expects * tuples of .rel and XCAS files in the input folder and writes the trained * annotator models to the output. Optionally you can specify even more details * about the training.
* * @author Benjamin Paassen - [email protected] */ public class Main { private static List parseArgumentString(String str) { List list = new ArrayList<>(); Matcher m = Pattern.compile("([^\"]\\S*|\".+?\")\\s*").matcher(str); while (m.find()) { list.add(m.group(1).replace("\"", "")); } return list; } private static String readLine(InputStream is) throws IOException { StringBuilder sb = new StringBuilder(); int data; while ((data = is.read()) != -1) { if (data == '\n') { return sb.toString(); } sb.append((char) data); } return null; } public static void main(String[] args) throws IOException { if (args.length < 1) { printHelp(); return; } try { switch (args[0]) { case "--annotation": { if (args.length < 3) { throw new UnsupportedOperationException("Expected a database file and an input path as second argument!"); } int i = 1; // Read the force tag boolean force = args[i].equals("--force") || args[i].equals("-f"); if (force) { i++; } // Read the database file, open the database read only! File dbFile = new File(args[i++]); if (!dbFile.isFile()) { throw new UnsupportedOperationException("Database file " + dbFile.getPath() + " does not exist!"); } DatabaseSingleton.initialize(new MapDBDatabase(dbFile, true, false)); // Read the input files File input = new File(args[i++]); final PDFFilter pdfFilter = new PDFFilter(); final File[] pdfs; if (input.isFile()) { if (!pdfFilter.accept(input)) { throw new UnsupportedOperationException("The given file is no PDF: " + input.getAbsolutePath()); } pdfs = new File[]{input}; } else { pdfs = input.listFiles(pdfFilter); if (pdfs == null || pdfs.length == 0) { throw new UnsupportedOperationException("The given directory contained no PDFs: " + input.getAbsolutePath()); } } // Read the output directory final File outputFolder; if (args.length < i + 1) { outputFolder = new File("."); System.out.println("No output directory was given. 
Using " + outputFolder.getAbsolutePath() + " per default."); } else { outputFolder = new File(args[i++]); if (!outputFolder.isDirectory()) { throw new UnsupportedOperationException("The given output path is no directory: " + outputFolder.getAbsolutePath()); } if (!outputFolder.exists()) { outputFolder.mkdirs(); System.out.println("Creating directory: " + outputFolder.getAbsolutePath()); } } // Start the annotation process annotate(pdfs, outputFolder, force); break; } case "--daemon": { if (args.length < 2) { throw new UnsupportedOperationException("Expected database file as second argument!"); } // Read the database file, open the database read only! File dbFile = new File(args[1]); if (!dbFile.isFile()) { throw new UnsupportedOperationException("Database file " + dbFile.getPath() + " does not exist!"); } DatabaseSingleton.initialize(new MapDBDatabase(dbFile, true, false)); // Run the interactive daemon runDaemon(); break; } case "--training": if (args.length < 2) { throw new UnsupportedOperationException("Expected an input path as second argument!"); } int j = 1; boolean force = args[j].equals("--force") || args[j].equals("-f"); if (force) { j++; } File input = new File(args[j++]); final RelFilter relFilter = new RelFilter(); final File[] relFiles; if (input.isDirectory()) { relFiles = input.listFiles(relFilter); if (relFiles == null || relFiles.length == 0) { throw new UnsupportedOperationException("The given directory contained no .rel-files: " + input.getAbsolutePath()); } } else { throw new UnsupportedOperationException("Expecting a directory as input!"); } final String classifierSpec; if (args.length < j + 1) { throw new UnsupportedOperationException("You did not specify (a) classifier(s) to train!"); } else { classifierSpec = args[j++]; } final File outputFolder; if (args.length < j + 1) { outputFolder = new File("."); System.out.println("No output directory was given. 
Using " + outputFolder.getAbsolutePath() + " per default."); } else { outputFolder = new File(args[j++]); if (!outputFolder.exists()) { outputFolder.mkdirs(); System.out.println("Creating directory: " + outputFolder.getAbsolutePath()); } if (!outputFolder.isDirectory()) { throw new UnsupportedOperationException("The given output path is no directory: " + outputFolder.getAbsolutePath()); } } Training.train(relFiles, classifierSpec, outputFolder, force); break; case "--rankingTrain": if (args.length < 2) { throw new UnsupportedOperationException("Expected an input path as second argument!"); } else { input = new File(args[1]); if (!input.isDirectory()) { throw new UnsupportedOperationException("Expecting a directory as input!"); } if (!input.exists()) { throw new UnsupportedOperationException("No directory exists at given input path: " + input.getAbsolutePath()); } rankingTrain(input); } break; default: printHelp(); break; } } finally { DatabaseSingleton.uninitialize(); } System.exit(0); } private static final class PDFFilter implements FileFilter { @Override public boolean accept(File file) { return file.getName().toLowerCase().endsWith(".pdf"); } } private static final class RelFilter implements FileFilter { @Override public boolean accept(File file) { return file.getName().toLowerCase().endsWith(".rel"); } } private static void runDaemon() throws IOException { final Annotator annotator = new Annotator(); String cmd; while ((cmd = readLine(System.in)) != null) { if (cmd.startsWith("EXEC")) { List cmds = parseArgumentString(cmd); if (cmds.size() != 5) { System.err.println("Error: Invalid command token count"); continue; } String type = cmds.get(1); if (!type.equals("PDF") && !type.equals("TXT")) { System.err.println("Error: Inalid input type " + type); continue; } String outputToken = cmds.get(2); File inFile = new File(cmds.get(3)); File outFile = new File(cmds.get(4)); // Reprint the parameters to stdout System.out.println("START " + type + " " + outputToken + " 
\"" + inFile.getPath() + "\" " + " \"" + outFile.getPath() + "\" "); // Perform the actual annotation try { CasIOUtil.writeXCas(annotator.annotateInputFile(inFile, type.equals("PDF")), outFile); System.out.println("DONE " + outputToken); } catch (UIMAException | IOException ex) { System.out.println("ERR " + outputToken); ex.printStackTrace(System.err); } System.out.flush(); } } } private static void printHelp() { System.out.println("SCIE -- Spinal Cord Injury Ontology Extraction"); System.out.println("Copyright (C) 2013, 2014\n"); System.out.println("Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel"); System.out.println(""); System.out.println("You can use this program in three modes:"); System.out.println("--annotation"); System.out.println("and"); System.out.println("--training"); System.out.println("and"); System.out.println("--rankingTrain"); System.out.println("You must specify the mode as first command line argument."); System.out.println(""); System.out.println("1.) Annotation Mode"); System.out.println(""); System.out.println("Command Line Usage:"); System.out.println("java -jar " + PROJECT_NAME + ".jar --annotation [--force|-f] [output_path]"); System.out.println("where either leads to a single PDF file " + "or to a directory containing PDF files"); System.out.println("Additionally you may set the 'force' flag such that " + "an output file that is already present is overriden."); System.out.println("Additionally you may specify an output folder where " + "the output is stored. Per default the current working " + "directory is used."); System.out.println(""); System.out.println("In Annotation mode the program will annotate the " + "given PDF files and create an XCAS file containing " + "annotations for each one."); System.out.println(""); System.out.println("2.) 
Training Mode"); System.out.println(""); System.out.println("Command Line Usage:"); System.out.println("java -jar " + PROJECT_NAME + ".jar --training [--force|-f] [output_path]"); System.out.println("where leads to a directory containing " + "tuples of .rel files and XCAS files."); System.out.println(" is a regular expression specifying" + " the classifier(s) you want to train."); System.out.println("Additionally you may set the 'force' flag such that " + "an output file that is already present is overriden."); System.out.println("Additionally you may specify an output folder where " + "the output is stored. Per default the current working " + "directory is used."); System.out.println(""); System.out.println("In Training Mode the program will take the given " + "tuples of .rel and XCAS file as training data and will " + "retrain all classifiers using the given data. The training " + "results are stored in 3 files for each classifier:"); System.out.println("* a .model file containing the LibLinear parameters"); System.out.println("* a .features file specifying the non-zero features"); System.out.println("* an .eval file containing the evaluation results for the training of this classifier."); System.out.println("Note that we currently do not support setting " + "training parameters using the command line. Unfortunately you " + "still need to change the source code (in the Main class) " + "to do that."); System.out.println(""); System.out.println("3.) 
Ranking for Training Mode"); System.out.println(""); System.out.println("java -jar " + PROJECT_NAME + ".jar --rankingTrain "); System.out.println("where leads to a directory containing " + "XCAS files."); System.out.println(""); System.out.println("In Ranking for Training Mode the programm will take the given" + "XCAS files and calculate wich one should be annotated by hand and trained to the programm to improve" + "the programm."); } private static void annotate(final File[] pdfs, final File outputFolder, final boolean force) { Annotator annotator = new Annotator(); for (final File pdf : pdfs) { System.out.println("Processing file " + pdf.getAbsolutePath()); try { // Make sure the output file is not overridden unless this is // explicitly wanted final File outputFile = new File(outputFolder, pdf.getName().replace(".pdf", ".xml")); if (outputFile.exists()) { if (!force) { System.out.println("Ignoring file because output file " + outputFile.getAbsolutePath() + " already exists!"); continue; } else { outputFile.delete(); } } // Perform the actual annotation CasIOUtil.writeXCas(annotator.annotateInputFile(pdf, true), outputFile); } catch (UIMAException | IOException ex) { System.err.println("The file could not be processed because of " + "an exception during parsing:"); ex.printStackTrace(System.err); } } } private static void rankingTrain(File input) { FilenameFilter filter = new FilenameFilter() { @Override public boolean accept(File dir, String name) { String lower = name.toLowerCase(); return lower.endsWith(".xml"); } }; File[] xcasFiles = input.listFiles(filter); JCas jcas; ArrayList ranking = new ArrayList(); for (File xcas : xcasFiles) { try { String[] entry = new String[3]; entry[0] = xcas.getName(); jcas = Typesystem.getJCas(TYPESYSTEM); CasIOUtil.readXCas(jcas, xcas); Collection errors = JCasUtil.select(jcas, Error.class); for (Error error : errors) { entry[1] = error.getDatabases(); entry[2] = error.getRelations(); } if (!entry[1].equals("") || ! 
!entry[2].equals("")) { ranking.add(entry); } } catch (IOException ex) { System.err.println("The file " + xcas.getName() + " could not be processed because of an exception during parsing:"); ex.printStackTrace(System.err); } } Collections.sort(ranking, new Comparator() { @Override public int compare(String[] error1, String[] error2) { int count1 = count(error1[1]); int count2 = count(error2[1]); if (count1 < count2) { return -1; } else if (count1 > count2) { return 1; } else { count1 = count(error1[2]); count2 = count(error2[2]); if (count1 < count2) { return -1; } else if (count1 > count2) { return 1; } else { return 0; } } } private int count(String type) { return type.split(";").length; } }); System.out.println("Ranking of Files:"); for (String[] out : ranking) { System.out.println(out[0]); if (!out[1].equals("")) { System.out.println("Update following databases: " + getErrors(out[1])); } if (!out[2].equals("")) { System.out.println("Annotate and train, because of missing relations: " + getErrors(out[2])); } } } private static String getErrors(String type) { return type.substring(0, type.length() - 1).replace(";", ", "); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy