de.julielab.xmlData.cli.ExtractDeleteCitations Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of costosys Show documentation
Show all versions of costosys Show documentation
A utility for managing documents stored in a PostgreSQL database. The documents are imported into a
PostgreSQL DB as full texts with the goal of being able to retrieve the documents efficiently by their PubMedID.
For more sophisticated tasks, a user configuration file can be supplied that controls the table
schema to use, the PostgreSQL schema to use, and the actual database server to connect to as well as the concrete
database.
/**
* ExtractDeleteCitations.java
*
* Copyright (c) 2010, JULIE Lab.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Common Public License v1.0
*
* Author: chew
*
* Current version: 1.0
* Since version: 1.0
*
* Creation date: 14.12.2010
**/
package de.julielab.xmlData.cli;
import java.io.File;
import java.io.FilenameFilter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import de.julielab.xml.JulieXMLConstants;
import de.julielab.xml.JulieXMLTools;
import de.julielab.xmlData.Constants;
/**
* Extracts PMIDs of deleted Medline documents from Medline Update XML batches.
* Currently the path to the XML files is hard-coded; this should be made more
* flexible.
*
* @author faessler
*/
public class ExtractDeleteCitations {
/** SLF4J logger for this class. */
private static final Logger LOG = LoggerFactory
.getLogger(ExtractDeleteCitations.class);
/**
 * Command line entry point; delegates directly to
 * {@link #extractDeletedPMIDs()}.
 *
 * @param args command line arguments; not read by this method
 */
public static void main(String[] args) {
extractDeletedPMIDs();
}
private static void extractDeletedPMIDs() {
LOG.info("Starting extraction...");
File baseDir = new File("/data/data_corpora/medline/updates");
if (!baseDir.isDirectory()) {
LOG.error(String.format(
"Path %s does not point to a directory.",
baseDir.getAbsolutePath()));
System.exit(1);
}
String[] fileNames = baseDir.list(new FilenameFilter() {
public boolean accept(File arg0, String arg1) {
return arg1.endsWith(".gz");
}
});
String forEachXpath = "/MedlineCitationSet/DeleteCitation/PMID";
List