All downloads are free. The search and download functionality uses the official Maven repository.

de.julielab.xmlData.cli.ExtractDeleteCitations Maven / Gradle / Ivy

Go to download

A utility for managing documents stored in a PostgreSQL database. The documents are imported into a PostgreSQL DB as full texts so that they can be retrieved efficiently by their PubMed ID. For more sophisticated tasks, a user configuration file can be supplied that controls which table schema and PostgreSQL schema to use, which database server to connect to, and which concrete database to use.

There is a newer version: 1.6.2
Show newest version
/**
 * ExtractDeleteCitations.java
 *
 * Copyright (c) 2010, JULIE Lab.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Common Public License v1.0
 *
 * Author: chew
 *
 * Current version: 1.0
 * Since version:   1.0
 *
 * Creation date: 14.12.2010
 **/

package de.julielab.xmlData.cli;

import java.io.File;
import java.io.FilenameFilter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import de.julielab.xml.JulieXMLConstants;
import de.julielab.xml.JulieXMLTools;
import de.julielab.xmlData.Constants;

/**
 * Extracts PMIDs of deleted Medline documents from Medline Update XML batches.
 * Currently the path to the XML files is hard coded, this should be made more
 * flexible.
 * 
 * @author faessler
 */
public class ExtractDeleteCitations {

	private static final Logger LOG = LoggerFactory
			.getLogger(ExtractDeleteCitations.class);

	public static void main(String[] args) {
		extractDeletedPMIDs();
	}
	
	private static void extractDeletedPMIDs() {
		LOG.info("Starting extraction...");
		File baseDir = new File("/data/data_corpora/medline/updates");
		if (!baseDir.isDirectory()) {
			LOG.error(String.format(
					"Path %s does not point to a directory.",
					baseDir.getAbsolutePath()));
			System.exit(1);
		}
		String[] fileNames = baseDir.list(new FilenameFilter() {
			public boolean accept(File arg0, String arg1) {
				return arg1.endsWith(".gz");
			}
		});

		String forEachXpath = "/MedlineCitationSet/DeleteCitation/PMID";
		List> fields = new ArrayList>();
		Map field = new HashMap();
		field.put(JulieXMLConstants.NAME, Constants.PMID_FIELD_NAME);
		field.put(JulieXMLConstants.XPATH,
				"/MedlineCitationSet/DeleteCitation/PMID");
		fields.add(field);

		int bufferSize = 1000;
		for (String fileName : fileNames) {
			Iterator> it = JulieXMLTools.constructRowIterator(
					baseDir.getAbsolutePath() + "/" + fileName, bufferSize,
					forEachXpath, fields, false);
			
			while (it.hasNext()) {
				Map row = it.next();
				String pmid = (String) row.get(Constants.PMID_FIELD_NAME);
				System.out.println(pmid);
			}
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy