// de.julielab.xmlData.cli.QueryPubMed (Maven / Gradle / Ivy)
//
// Go to download
package de.julielab.xmlData.cli;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;

import com.ximpleware.AutoPilot;
import com.ximpleware.EOFException;
import com.ximpleware.EncodingException;
import com.ximpleware.EntityException;
import com.ximpleware.NavException;
import com.ximpleware.ParseException;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import com.ximpleware.XPathEvalException;
import com.ximpleware.XPathParseException;

import de.julielab.xml.JulieXMLTools;

/**
 * Small helper that runs an ESearch request against PubMed (NCBI E-utilities)
 * and extracts the matching PMIDs from the XML answer.
 */
public class QueryPubMed {
	// NCBI E-utilities only accept HTTPS requests; URL.openStream() does not
	// follow an http -> https redirect, so the scheme must be https here.
	private final static String SITE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi";
	// Upper bound on returned IDs: 10^8, about 5x the size of PubMed (2011).
	// NOTE(review): the service may enforce a lower cap on retmax - TODO confirm.
	private final static String RETMAX = "100000000";
	// Buffer size handed to JulieXMLTools.readStream when reading the response.
	private final static int BUFFERSIZE = 1024;
	// XPath selecting every <Id> element of the ESearch result.
	private final static String XPATH = "/eSearchResult/IdList/Id";

	/**
	 * Query PubMed via REST-API, returning up to 10^8 matched PMIDs.
	 * Queried terms get expanded, e.g. "Il-1" will match "interleukin-1".
	 * Searches with really many results (e.g. "cancer") need increased heap space!
	 * More details: http://eutils.ncbi.nlm.nih.gov/corehtml/query/static/esearch_help.html 
	 * <p>
	 * Errors (I/O, XML parsing, XPath evaluation) are not propagated: the stack
	 * trace is printed and whatever IDs were collected so far (possibly none)
	 * are returned.
	 * 
	 * @param query -Query for PubMed as a String
	 * @return - ArrayList, containing PMIDs as Strings; never null
	 */
	public static ArrayList<String> query(String query) {
		ArrayList<String> ids = new ArrayList<String>();
		try {
			StringBuilder queryBuilder = new StringBuilder();
			// NOTE(review): the last literal below looks like a redacted e-mail
			// address that lost its "&email=" parameter prefix; as written it is
			// appended directly to the tool value. Restore the real contact
			// address - TODO confirm.
			queryBuilder.append(SITE)
					.append("?term=").append(URLEncoder.encode(query, "UTF-8"))
					.append("&retmax=").append(RETMAX).append("&tool=julie-medline-manager")
					.append("[email protected]");
			URL url = new URL(queryBuilder.toString());

			VTDGen vg = new VTDGen(); // Parses XML
			InputStream stream = url.openStream();
			try {
				vg.setDoc(JulieXMLTools.readStream(stream, BUFFERSIZE));
			} finally {
				// Always release the connection, even when reading fails.
				stream.close();
			}
			vg.parse(true);
			VTDNav vn = vg.getNav(); // Navigates in parsed XML
			AutoPilot ap = new AutoPilot(vn); // Moves through whole XML

			ap.selectXPath(XPATH);
			while (ap.evalXPath() != -1) {
				// Upper 32 bits encode the length, lower 32 bits the offset.
				long fragment = vn.getContentFragment();
				if (fragment == -1) {
					// Empty element: there is no content to extract.
					continue;
				}
				// Lower 32 bits.
				int offset = (int) fragment;
				// Upper 32 bits; the parentheses are required because a cast
				// binds tighter than the right shift.
				int length = (int) (fragment >> 32);
				ids.add(vn.toString(offset, length));
			}

		} catch (IOException e) {
			e.printStackTrace();
		} catch (EncodingException e) {
			e.printStackTrace();
		} catch (EOFException e) {
			e.printStackTrace();
		} catch (EntityException e) {
			e.printStackTrace();
		} catch (ParseException e) {
			e.printStackTrace();
		} catch (XPathParseException e) {
			e.printStackTrace();
		} catch (XPathEvalException e) {
			e.printStackTrace();
		} catch (NavException e) {
			e.printStackTrace();
		}

		return ids;
	}
}