All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.citec.scie.Main Maven / Gradle / Ivy

Go to download

Contains the SCIE main application and the CLI interface. This project integrates the named entity recognition (NER), the PDF import and the classification and interfaces with the UIMA framework. The command line interface can be used to produce a set of UIMA XCAS files.

There is a newer version: 2.0.1
Show newest version
/*
 * SCIE -- Spinal Cord Injury Information Extraction
 * Copyright (C) 2013, 2014
 * Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.citec.scie;

import de.citec.scie.descriptors.Error;
import de.citec.scie.ner.db.generic.DatabaseSingleton;
import de.citec.scie.ner.db.mapdb.MapDBDatabase;
import de.citec.scie.typesystem.Typesystem;
import static de.citec.scie.Constants.PROJECT_NAME;
import static de.citec.scie.Constants.TYPESYSTEM;
import java.io.File;
import java.io.FileFilter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.UIMAException;
import org.apache.uima.fit.util.CasIOUtil;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;

/**
 * This is the command line interface for the SCIE project. It can be called in
 * two basic modes:
 * 
  • Annotation, which takes a .pdf file (or multiple .pdf files) as input * and returns the annotations as XCAS output
  • *
  • Training, which takes two directories (input and output). It expects * tuples of .rel and XCAS files in the input folder and writes the trained * annotator models to the output. Optionally you can specify even more details * about the training.
* * @author Benjamin Paassen - [email protected] */ public class Main { private static List parseArgumentString(String str) { List list = new ArrayList<>(); Matcher m = Pattern.compile("([^\"]\\S*|\".+?\")\\s*").matcher(str); while (m.find()) { list.add(m.group(1).replace("\"", "")); } return list; } private static String readLine(InputStream is) throws IOException { StringBuilder sb = new StringBuilder(); int data; while ((data = is.read()) != -1) { if (data == '\n') { return sb.toString(); } sb.append((char) data); } return null; } public static void main(String[] args) throws IOException { if (args.length < 1) { printHelp(); return; } try { switch (args[0]) { case "--annotation": { if (args.length < 3) { throw new UnsupportedOperationException("Expected a database file and an input path as second argument!"); } int i = 1; // Read the force tag boolean force = args[i].equals("--force") || args[i].equals("-f"); if (force) { i++; } // Read the database file, open the database read only! File dbFile = new File(args[i++]); if (!dbFile.isFile()) { throw new UnsupportedOperationException("Database file " + dbFile.getPath() + " does not exist!"); } DatabaseSingleton.initialize(new MapDBDatabase(dbFile, true, false)); // Read the input files File input = new File(args[i++]); final PDFFilter pdfFilter = new PDFFilter(); final File[] pdfs; if (input.isFile()) { if (!pdfFilter.accept(input)) { throw new UnsupportedOperationException("The given file is no PDF: " + input.getAbsolutePath()); } pdfs = new File[]{input}; } else { pdfs = input.listFiles(pdfFilter); if (pdfs == null || pdfs.length == 0) { throw new UnsupportedOperationException("The given directory contained no PDFs: " + input.getAbsolutePath()); } } // Read the output directory final File outputFolder; if (args.length < i + 1) { outputFolder = new File("."); System.out.println("No output directory was given. 
Using " + outputFolder.getAbsolutePath() + " per default."); } else { outputFolder = new File(args[i++]); if (!outputFolder.isDirectory()) { throw new UnsupportedOperationException("The given output path is no directory: " + outputFolder.getAbsolutePath()); } if (!outputFolder.exists()) { outputFolder.mkdirs(); System.out.println("Creating directory: " + outputFolder.getAbsolutePath()); } } // Start the annotation process annotate(pdfs, outputFolder, force); break; } case "--daemon": { if (args.length < 2) { throw new UnsupportedOperationException("Expected database file as second argument!"); } // Read the database file, open the database read only! File dbFile = new File(args[1]); if (!dbFile.isFile()) { throw new UnsupportedOperationException("Database file " + dbFile.getPath() + " does not exist!"); } DatabaseSingleton.initialize(new MapDBDatabase(dbFile, true, false)); // Run the interactive daemon runDaemon(); break; } case "--training": if (args.length < 2) { throw new UnsupportedOperationException("Expected an input path as second argument!"); } int j = 1; boolean force = args[j].equals("--force") || args[j].equals("-f"); if (force) { j++; } File input = new File(args[j++]); final RelFilter relFilter = new RelFilter(); final File[] relFiles; if (input.isDirectory()) { relFiles = input.listFiles(relFilter); if (relFiles == null || relFiles.length == 0) { throw new UnsupportedOperationException("The given directory contained no .rel-files: " + input.getAbsolutePath()); } } else { throw new UnsupportedOperationException("Expecting a directory as input!"); } final String classifierSpec; if (args.length < j + 1) { throw new UnsupportedOperationException("You did not specify (a) classifier(s) to train!"); } else { classifierSpec = args[j++]; } final File outputFolder; if (args.length < j + 1) { outputFolder = new File("."); System.out.println("No output directory was given. 
Using " + outputFolder.getAbsolutePath() + " per default."); } else { outputFolder = new File(args[j++]); if (!outputFolder.isDirectory()) { throw new UnsupportedOperationException("The given output path is no directory: " + outputFolder.getAbsolutePath()); } if (!outputFolder.exists()) { outputFolder.mkdirs(); System.out.println("Creating directory: " + outputFolder.getAbsolutePath()); } } Training.train(relFiles, classifierSpec, outputFolder, force); break; case "--rankingTrain": if (args.length < 2) { throw new UnsupportedOperationException("Expected an input path as second argument!"); } else { input = new File(args[1]); if (!input.isDirectory()) { throw new UnsupportedOperationException("Expecting a directory as input!"); } if (!input.exists()) { throw new UnsupportedOperationException("No directory exists at given input path: " + input.getAbsolutePath()); } rankingTrain(input); } break; default: printHelp(); break; } } finally { DatabaseSingleton.uninitialize(); } System.exit(0); } private static final class PDFFilter implements FileFilter { @Override public boolean accept(File file) { return file.getName().toLowerCase().endsWith(".pdf"); } } private static final class RelFilter implements FileFilter { @Override public boolean accept(File file) { return file.getName().toLowerCase().endsWith(".rel"); } } private static void runDaemon() throws IOException { final Annotator annotator = new Annotator(); String cmd; while ((cmd = readLine(System.in)) != null) { if (cmd.startsWith("EXEC")) { List cmds = parseArgumentString(cmd); if (cmds.size() != 5) { System.err.println("Error: Invalid command token count"); continue; } String type = cmds.get(1); if (!type.equals("PDF") && !type.equals("TXT")) { System.err.println("Error: Inalid input type " + type); continue; } String outputToken = cmds.get(2); File inFile = new File(cmds.get(3)); File outFile = new File(cmds.get(4)); // Reprint the parameters to stdout System.out.println("START " + type + " " + outputToken + " 
\"" + inFile.getPath() + "\" " + " \"" + outFile.getPath() + "\" "); // Perform the actual annotation try { CasIOUtil.writeXCas(annotator.annotateInputFile(inFile, type.equals("PDF")), outFile); System.out.println("DONE " + outputToken); } catch (UIMAException | IOException ex) { System.out.println("ERR " + outputToken); ex.printStackTrace(System.err); } System.out.flush(); } } } private static void printHelp() { System.out.println("SCIE -- Spinal Cord Injury Ontology Extraction"); System.out.println("Copyright (C) 2013, 2014\n"); System.out.println("Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel"); System.out.println(""); System.out.println("You can use this program in three modes:"); System.out.println("--annotation"); System.out.println("and"); System.out.println("--training"); System.out.println("and"); System.out.println("--rankingTrain"); System.out.println("You must specify the mode as first command line argument."); System.out.println(""); System.out.println("1.) Annotation Mode"); System.out.println(""); System.out.println("Command Line Usage:"); System.out.println("java -jar " + PROJECT_NAME + ".jar --annotation [--force|-f] [output_path]"); System.out.println("where either leads to a single PDF file " + "or to a directory containing PDF files"); System.out.println("Additionally you may set the 'force' flag such that " + "an output file that is already present is overriden."); System.out.println("Additionally you may specify an output folder where " + "the output is stored. Per default the current working " + "directory is used."); System.out.println(""); System.out.println("In Annotation mode the program will annotate the " + "given PDF files and create an XCAS file containing " + "annotations for each one."); System.out.println(""); System.out.println("2.) 
Training Mode"); System.out.println(""); System.out.println("Command Line Usage:"); System.out.println("java -jar " + PROJECT_NAME + ".jar --training [--force|-f] [output_path]"); System.out.println("where leads to a directory containing " + "tuples of .rel files and XCAS files."); System.out.println("Additionally you may set the 'force' flag such that " + "an output file that is already present is overriden."); System.out.println("Additionally you may specify an output folder where " + "the output is stored. Per default the current working " + "directory is used."); System.out.println(""); System.out.println("In Training Mode the program will take the given " + "tuples of .rel and XCAS file as training data and will " + "retrain all classifiers using the given data. The training " + "results are stored in 3 files for each classifier:"); System.out.println("* a .model file containing the LibLinear parameters"); System.out.println("* a .features file specifying the non-zero features"); System.out.println("* an .eval file containing the evaluation results for the training of this classifier."); System.out.println("Note that we currently do not support setting " + "training parameters using the command line. Unfortunately you " + "still need to change the source code (in the Main class) " + "to do that."); System.out.println(""); System.out.println("3.) 
Ranking for Training Mode"); System.out.println(""); System.out.println("java -jar " + PROJECT_NAME + ".jar --rankingTrain "); System.out.println("where leads to a directory containing " + "XCAS files."); System.out.println(""); System.out.println("In Ranking for Training Mode the programm will take the given" + "XCAS files and calculate wich one should be annotated by hand and trained to the programm to improve" + "the programm."); } private static void annotate(final File[] pdfs, final File outputFolder, final boolean force) { Annotator annotator = new Annotator(); for (final File pdf : pdfs) { System.out.println("Processing file " + pdf.getAbsolutePath()); try { // Make sure the output file is not overridden unless this is // explicitly wanted final File outputFile = new File(outputFolder, pdf.getName().replace(".pdf", ".xml")); if (outputFile.exists()) { if (!force) { System.out.println("Ignoring file because output file " + outputFile.getAbsolutePath() + " already exists!"); continue; } else { outputFile.delete(); } } // Perform the actual annotation CasIOUtil.writeXCas(annotator.annotateInputFile(pdf, true), outputFile); } catch (UIMAException | IOException ex) { System.err.println("The file could not be processed because of " + "an exception during parsing:"); ex.printStackTrace(System.err); } } } private static void rankingTrain(File input) { FilenameFilter filter = new FilenameFilter() { @Override public boolean accept(File dir, String name) { String lower = name.toLowerCase(); return lower.endsWith(".xml"); } }; File[] xcasFiles = input.listFiles(filter); JCas jcas; ArrayList ranking = new ArrayList(); for (File xcas : xcasFiles) { try { String[] entry = new String[3]; entry[0] = xcas.getName(); jcas = Typesystem.getJCas(TYPESYSTEM); CasIOUtil.readXCas(jcas, xcas); Collection errors = JCasUtil.select(jcas, Error.class); for (Error error : errors) { entry[1] = error.getDatabases(); entry[2] = error.getRelations(); } if (!entry[1].equals("") || ! 
!entry[2].equals("")) { ranking.add(entry); } } catch (IOException ex) { System.err.println("The file " + xcas.getName() + " could not be processed because of an exception during parsing:"); ex.printStackTrace(System.err); } } Collections.sort(ranking, new Comparator() { @Override public int compare(String[] error1, String[] error2) { int count1 = count(error1[1]); int count2 = count(error2[1]); if (count1 < count2) { return -1; } else if (count1 > count2) { return 1; } else { count1 = count(error1[2]); count2 = count(error2[2]); if (count1 < count2) { return -1; } else if (count1 > count2) { return 1; } else { return 0; } } } private int count(String type) { return type.split(";").length; } }); System.out.println("Ranking of Files:"); for (String[] out : ranking) { System.out.println(out[0]); if (!out[1].equals("")) { System.out.println("Update following databases: " + getErrors(out[1])); } if (!out[2].equals("")) { System.out.println("Annotate and train, because of missing relations: " + getErrors(out[2])); } } } private static String getErrors(String type) { return type.substring(0, type.length() - 1).replace(";", ", "); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy