All downloads are free. Search and download functionality uses the official Maven repository.

de.julielab.xmlData.cli.QueryPubMed Maven / Gradle / Ivy

Go to download

A utility for managing documents stored in a PostgreSQL database. The documents are imported into a PostgreSQL DB as full texts so that they can be retrieved efficiently by their PubMed ID. For more sophisticated tasks, a user configuration file can be supplied that controls the table schema, the PostgreSQL schema, the database server to connect to, and the concrete database.

There is a newer version: 1.6.2
Show newest version
package de.julielab.xmlData.cli;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;

import com.ximpleware.AutoPilot;
import com.ximpleware.EOFException;
import com.ximpleware.EncodingException;
import com.ximpleware.EntityException;
import com.ximpleware.NavException;
import com.ximpleware.ParseException;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import com.ximpleware.XPathEvalException;
import com.ximpleware.XPathParseException;

import de.julielab.xml.JulieXMLTools;

/**
 * Small helper that sends a search term to the NCBI E-utilities ESearch
 * service and collects the PubMed IDs (PMIDs) contained in the XML response.
 */
public class QueryPubMed {
	// NCBI serves E-utilities over HTTPS only; java.net.URL does not follow
	// a redirect that switches from http to https, so HTTPS is used directly.
	private final static String SITE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi";
	private final static String RETMAX = "100000000"; // 5x the size of PubMed (2011)
	private final static int BUFFERSIZE = 1024;
	private final static String XPATH = "/eSearchResult/IdList/Id";

	/**
	 * Query PubMed via REST-API, requesting up to 10^8 matched PMIDs.
	 * Queried terms get expanded, e.g. "Il-1" will match "interleukin-1".
	 * Searches with really many results (e.g. "cancer") need increased heap space!
	 * More details: http://eutils.ncbi.nlm.nih.gov/corehtml/query/static/esearch_help.html
	 * <p>
	 * This method is best-effort: I/O, parsing and XPath errors are printed to
	 * standard error and the IDs collected up to that point (possibly none)
	 * are returned.
	 *
	 * @param query - Query for PubMed as a String; it is URL-encoded as UTF-8
	 * @return - ArrayList, containing PMIDs as Strings; never null
	 */
	public static ArrayList<String> query(String query) {
		ArrayList<String> ids = new ArrayList<String>();
		InputStream stream = null;
		try {
			StringBuilder queryBuilder = new StringBuilder();
			// NOTE(review): the last literal below looks like residue of e-mail
			// obfuscation (presumably it once read "&email=<address>"). It is
			// kept unchanged here because the original value is unknown - confirm
			// against the project's source repository.
			queryBuilder.append(SITE)
					.append("?term=").append(URLEncoder.encode(query, "UTF-8"))
					.append("&retmax=").append(RETMAX).append("&tool=julie-medline-manager")
					.append("[email protected]");
			URL url = new URL(queryBuilder.toString());
			stream = url.openStream();

			VTDGen vg = new VTDGen(); // Parses XML
			vg.setDoc(JulieXMLTools.readStream(stream, BUFFERSIZE));
			vg.parse(true);
			VTDNav vn = vg.getNav(); // Navigates in parsed XML
			AutoPilot ap = new AutoPilot(vn); // Moves through whole XML

			ap.selectXPath(XPATH);
			while (ap.evalXPath() != -1) {
				// 32 bits encoding length, 32 bits encoding offset
				long fragment = vn.getContentFragment();
				if (fragment == -1) {
					// Element without content (e.g. <Id/>): nothing to extract.
					continue;
				}
				// right 32 bits
				int offset = (int) fragment;
				// left 32 bits; the parentheses are required because a cast
				// binds tighter than the shift operator
				int length = (int) (fragment >> 32);
				ids.add(vn.toString(offset, length));
			}

		} catch (IOException e) {
			e.printStackTrace();
		} catch (EncodingException e) {
			e.printStackTrace();
		} catch (EOFException e) {
			e.printStackTrace();
		} catch (EntityException e) {
			e.printStackTrace();
		} catch (ParseException e) {
			e.printStackTrace();
		} catch (XPathParseException e) {
			e.printStackTrace();
		} catch (XPathEvalException e) {
			e.printStackTrace();
		} catch (NavException e) {
			e.printStackTrace();
		} finally {
			// Always release the HTTP connection, also when parsing failed.
			if (stream != null) {
				try {
					stream.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}

		return ids;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy