
org.biojava.nbio.structure.chem.DownloadChemCompProvider Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of biojava-structure Show documentation
Show all versions of biojava-structure Show documentation
The protein structure modules of BioJava.
The newest version!
package org.biojava.nbio.structure.chem;
import org.biojava.nbio.core.util.InputStreamProvider;
import org.biojava.nbio.structure.align.util.URLConnectionTools;
import org.biojava.nbio.structure.align.util.UserConfiguration;
import org.biojava.nbio.structure.io.LocalPDBDirectory;
import org.biojava.nbio.structure.io.cif.ChemCompConverter;
import org.rcsb.cif.ParsingException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPOutputStream;
/**
 * This provider of chemical components can download and cache chemical component definition files from the RCSB PDB
 * web site. It is the default way to access these definitions.
 * <p>
 * Individual definitions are downloaded on demand the first time they are requested and are then cached as gzipped
 * CIF files in a local directory. If {@link #setDownloadAll(boolean)} has been enabled, the first request for a
 * missing definition instead downloads the complete dictionary and splits it into one file per component.
 * <p>
 * An alternative provider, that keeps all definitions in memory is the {@link AllChemCompProvider}. Another provider,
 * that does not require any network access, but only can support a limited set of chemical component definitions, is
 * the {@link ReducedChemCompProvider}.
 *
 * @author Andreas Prlic
 */
public class DownloadChemCompProvider implements ChemCompProvider {

    private static final Logger logger = LoggerFactory.getLogger(DownloadChemCompProvider.class);

    private static final String NEWLINE = System.getProperty("line.separator");

    /** Name of the sub-directory (below {@link #getPath()}) in which the definition files are cached. */
    public static final String CHEM_COMP_CACHE_DIRECTORY = "chemcomp";

    public static final String DEFAULT_SERVER_URL = "https://files.rcsb.org/ligands/download/";

    public static final String DEFAULT_CHEMCOMP_PATHURL_TEMPLATE = "{ccd_id}.cif";

    /** File name of the complete dictionary inside the cache directory. */
    private static final String ALL_COMPONENTS_FILE_NAME = "components.cif.gz";

    /** Suffix of every cached definition file. */
    private static final String COMPONENT_FILE_SUFFIX = ".cif.gz";

    /** If fewer cached files than this are found, the complete dictionary is considered not (fully) unpacked. */
    private static final int MIN_INSTALLED_FILES = 500;

    /**
     * The base URL to which the full path specified via {@link #setChemCompPathUrlTemplate(String)} is appended.
     * It is assumed that it has a trailing slash.
     */
    public static String serverBaseUrl = DEFAULT_SERVER_URL;

    private static File path;

    private static String chemCompPathUrlTemplate = DEFAULT_CHEMCOMP_PATHURL_TEMPLATE;

    /**
     * Matches the placeholders {@code {ccd_id}}, {@code {ccd_id:N}}, {@code {ccd_id:-N}} and
     * {@code {ccd_id:B_E}}; group 1 captures the optional index specification.
     */
    static final Pattern CCD_ID_TEMPLATE_REGEX = Pattern.compile("\\{ccd_id(?::(\\d+_\\d+|[-+]?\\d+))?}");

    // flag to make sure there is only one thread running that is loading the dictionary
    static AtomicBoolean loading = new AtomicBoolean(false);

    /**
     * Identifiers whose local file name gets a leading underscore, see {@link #getLocalFileName(String)}.
     * NOTE(review): presumably because these are reserved device names on Windows - confirm.
     */
    static final List<String> protectedIDs = new ArrayList<>();
    static {
        protectedIDs.add("CON");
        protectedIDs.add("PRN");
        protectedIDs.add("AUX");
        protectedIDs.add("NUL");
    }

    private static ChemCompProvider fallback = null; // Fallback provider if the download fails

    /**
     * by default we will download only some of the files. User has to request that all files should be downloaded...
     */
    boolean downloadAll = false;

    public DownloadChemCompProvider() {
        this(null);
    }

    /**
     * @param cacheFilePath the cache directory to use, or null to keep the current/default one. Note that the
     * cache path is static, i.e. it is shared by all instances of this provider.
     */
    public DownloadChemCompProvider(String cacheFilePath) {
        logger.debug("Initialising DownloadChemCompProvider");
        // note that path is static, so this is just to make sure that all non-static methods will have path initialised
        if (cacheFilePath != null) {
            path = new File(cacheFilePath);
        }
    }

    /**
     * Set the base URL for the location of all chemical component CIF files, to which the chemCompPathUrlTemplate
     * is appended, settable in {@link #setChemCompPathUrlTemplate(String)}. A trailing slash is appended
     * if not present.
     */
    public static void setServerBaseUrl(String serverBaseUrl) {
        if (!serverBaseUrl.endsWith("/")) {
            serverBaseUrl = serverBaseUrl + "/";
        }
        DownloadChemCompProvider.serverBaseUrl = serverBaseUrl;
    }

    /**
     * Set the path to append to the serverBaseUrl (settable in {@link #setServerBaseUrl(String)}).
     * The string can contain placeholders that will be expanded at runtime:
     * <ul>
     * <li>"{ccd_id}" to be replaced by the chemical component identifier, in capitals</li>
     * <li>"{ccd_id:beginIndex_endIndex}" to be replaced by a substring of the chemical component identifier in
     * capitals, with indices following the same convention as {@link String#substring(int, int)}</li>
     * <li>"{ccd_id:index}" to be replaced by a substring of the chemical component identifier in capitals,
     * with index either a positive or negative integer to substring from left or right of the string
     * respectively.</li>
     * </ul>
     * If any of the indices are off-bounds, then the full chemical component identifier is replaced
     */
    public static void setChemCompPathUrlTemplate(String chemCompPathUrlTemplate) {
        DownloadChemCompProvider.chemCompPathUrlTemplate = chemCompPathUrlTemplate;
    }

    /**
     * Get this provider's cache path. If no path has been set yet, it is initialised from a new
     * {@link UserConfiguration}.
     *
     * @return the directory below which the {@value #CHEM_COMP_CACHE_DIRECTORY} cache directory is located
     */
    public static File getPath() {
        if (path == null) {
            UserConfiguration config = new UserConfiguration();
            path = new File(config.getCacheFilePath());
        }
        return path;
    }

    /**
     * Checks if the chemical components already have been installed into the PDB directory.
     * If not, will download the chemical components definitions file and split it up into small
     * subfiles. Does nothing unless {@link #setDownloadAll(boolean)} has been enabled.
     */
    public void checkDoFirstInstall() {
        if (!downloadAll) {
            return;
        }
        // this makes sure there is a file separator between every component,
        // if path has a trailing file separator or not, it will work for both cases
        File dir = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY);
        File f = new File(dir, ALL_COMPONENTS_FILE_NAME);
        if (!f.exists()) {
            downloadAllDefinitions();
            return;
        }
        // file exists.. did it get extracted?
        FilenameFilter filter = (dir1, file) -> file.endsWith(COMPONENT_FILE_SUFFIX);
        String[] files = dir.list(filter);
        // File.list returns null if the directory cannot be read; treat that like "not unpacked"
        if (files == null || files.length < MIN_INSTALLED_FILES) {
            // not all did get unpacked
            try {
                split();
            } catch (IOException e) {
                logger.error("Could not split file {} into individual chemical component files. Error: {}",
                        f.toString(), e.getMessage());
            }
        }
    }

    /**
     * Splits the complete dictionary file into one gzipped file per chemical component. A new record starts at
     * every line beginning with "data_"; the remainder of that line is used as the component identifier.
     *
     * @throws IOException if the dictionary cannot be read or a component file cannot be written
     */
    private void split() throws IOException {
        logger.info("Installing individual chem comp files ...");
        File dir = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY);
        File f = new File(dir, ALL_COMPONENTS_FILE_NAME);
        int counter = 0;
        InputStreamProvider prov = new InputStreamProvider();
        try (BufferedReader buf = new BufferedReader(
                new InputStreamReader(prov.getInputStream(f), StandardCharsets.UTF_8))) {
            StringBuilder record = new StringBuilder();
            String currentID = null;
            String line;
            while ((line = buf.readLine()) != null) {
                if (line.startsWith("data_")) {
                    // a new record found!
                    if (currentID != null) {
                        writeID(record.toString(), currentID);
                        counter++;
                    }
                    currentID = line.substring(5).trim();
                    // anything collected before the first record is discarded here
                    record.setLength(0);
                }
                record.append(line).append(NEWLINE);
            }
            // write the last record, unless the file did not contain any record at all
            if (currentID != null) {
                writeID(record.toString(), currentID);
                counter++;
            }
        }
        logger.info("Created {} chemical component files.", counter);
    }

    /**
     * Output chemical contents to a file
     * @param contents File contents
     * @param currentID Chemical ID, used to determine the filename
     * @throws IOException if the file cannot be written completely
     */
    private void writeID(String contents, String currentID) throws IOException {
        String localName = getLocalFileName(currentID);
        try (BufferedWriter out = openGzipWriter(new File(localName))) {
            out.write(contents);
        }
    }

    /**
     * Opens a UTF-8 writer that gzips its output into the given file. In contrast to a {@link PrintWriter},
     * write failures are reported as {@link IOException}s instead of being silently swallowed.
     *
     * @param target the file to (over)write
     * @return a writer that must be closed by the caller
     * @throws IOException if the file cannot be opened or the gzip header cannot be written
     */
    private static BufferedWriter openGzipWriter(File target) throws IOException {
        FileOutputStream fileOut = new FileOutputStream(target);
        try {
            return new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(fileOut), StandardCharsets.UTF_8));
        } catch (IOException e) {
            // the GZIPOutputStream constructor writes the header and may fail: don't leak the file handle
            try {
                fileOut.close();
            } catch (IOException closeException) {
                e.addSuppressed(closeException);
            }
            throw e;
        }
    }

    /**
     * Loads the definitions for this {@link ChemComp} from a local file and instantiates a new object.
     * If the file is not cached yet it is downloaded first. If the definition can neither be downloaded nor
     * parsed, the request is delegated to a {@link ReducedChemCompProvider}.
     *
     * @param recordName the ID of the {@link ChemComp}
     * @return a new {@link ChemComp} definition, or null if recordName is "?"
     */
    @Override
    public ChemComp getChemComp(String recordName) {
        // make sure we work with upper case records (locale independent, e.g. no dotted capital I in Turkish)
        recordName = recordName.toUpperCase(Locale.ROOT).trim();
        if ("?".equals(recordName)) {
            return null;
        }
        if (fileIsAbsent(recordName)) {
            // check if we should install all components
            checkDoFirstInstall();
        }
        boolean haveFile = true;
        if (fileIsAbsent(recordName)) {
            // we previously have installed already the definitions,
            // just do an incremental update
            haveFile = downloadChemCompRecord(recordName);
        }
        // Added check that download was successful and chemical component is available.
        if (haveFile) {
            String filename = getLocalFileName(recordName);
            try {
                ChemicalComponentDictionary dict = ChemCompConverter.fromPath(Paths.get(filename));
                ChemComp chemComp = dict.getChemComp(recordName);
                // May be null if the file was corrupt. Fall back on ReducedChemCompProvider in that case
                if (chemComp != null) {
                    return chemComp;
                }
            } catch (ParsingException e) {
                // happens for corrupt files
                logger.warn("Could not parse chemical component file {} for {}. Error: {}",
                        filename, recordName, e.getMessage());
            } catch (IOException e) {
                logger.warn("Could not read chemical component file {} for {}. Error: {}. Now trying to use the " +
                        "local chemical component definitions.", filename, recordName, e.getMessage());
            }
        }
        // see https://github.com/biojava/biojava/issues/315
        // probably a network error happened. Try to use the ReducedChemCompProvider
        if (fallback == null) {
            fallback = new ReducedChemCompProvider();
        }
        logger.warn("Falling back to ReducedChemCompProvider for {}. This could indicate a network error.", recordName);
        return fallback.getChemComp(recordName);
    }

    /**
     * Returns the file name that contains the definition for this {@link ChemComp}. The cache directory is
     * created if it does not exist yet. Identifiers listed in {@link #protectedIDs} are prefixed with an
     * underscore.
     *
     * @param recordName the ID of the {@link ChemComp}
     * @return full path to the file
     */
    public static String getLocalFileName(String recordName) {
        if (protectedIDs.contains(recordName)) {
            recordName = "_" + recordName;
        }
        File f = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY);
        if (!f.exists()) {
            logger.info("Creating directory {}", f);
            // mkdirs also creates missing parents; the isDirectory check avoids a spurious error if
            // another thread created the directory in the meantime
            if (!f.mkdirs() && !f.isDirectory()) {
                logger.error("Directory {} could not be created", f);
            }
        }
        File theFile = new File(f, recordName + COMPONENT_FILE_SUFFIX);
        return theFile.toString();
    }

    /**
     * Checks if the cached file for the given record is missing. Files shorter than
     * {@link LocalPDBDirectory#MIN_PDB_FILE_SIZE} are deleted and reported as absent.
     *
     * @param recordName the ID of the {@link ChemComp}
     * @return true if there is no usable cached file
     */
    private static boolean fileIsAbsent(String recordName) {
        String fileName = getLocalFileName(recordName);
        File f = new File(fileName);
        // delete files that are too short to have contents
        if (f.length() < LocalPDBDirectory.MIN_PDB_FILE_SIZE) {
            // Delete defensively.
            // Note that if delete is unsuccessful, we re-download the file anyways
            f.delete();
            return true;
        }
        return !f.exists();
    }

    /**
     * Expands the given path URL template, replacing the placeholders as specified in {@link #setChemCompPathUrlTemplate(String)}
     * by the ccdId given (or its substrings, if indices are present in the template)
     * @param templateStr the template string with placeholders for ccd ids
     * @param ccdId the ccd id to replace (in full or a substring)
     * @return the input templateStr with placeholders replaced
     */
    static String expandPathUrlTemplate(String templateStr, String ccdId) {
        Matcher m = CCD_ID_TEMPLATE_REGEX.matcher(templateStr);
        StringBuilder output = new StringBuilder();
        int lastIndex = 0;
        while (m.find()) {
            // default: no substringing, also used if the indices turn out to be unusable
            String repString = ccdId;
            String indicesStr = m.group(1);
            if (indicesStr != null) {
                try {
                    int separator = indicesStr.indexOf('_');
                    if (separator < 0) {
                        // left/right substring
                        int idx = Integer.parseInt(indicesStr);
                        if (idx < 0) { // right substring
                            repString = ccdId.substring(ccdId.length() + idx);
                        } else { // left substring
                            repString = ccdId.substring(0, idx);
                        }
                    } else {
                        // start and end index
                        int begIdx = Integer.parseInt(indicesStr.substring(0, separator));
                        int endIdx = Integer.parseInt(indicesStr.substring(separator + 1));
                        repString = ccdId.substring(begIdx, endIdx);
                    }
                } catch (IndexOutOfBoundsException | NumberFormatException e) {
                    // repString keeps its original value ccdId. NumberFormatException happens for indices that
                    // match the regex but do not fit into an int
                    repString = ccdId;
                    logger.debug("Indices included in path URL template {} are out of bounds for string {}", templateStr, ccdId);
                }
            }
            output.append(templateStr, lastIndex, m.start()).append(repString);
            lastIndex = m.end();
            // TODO when we upgrade to java 11, use the new methods introduced in java 9, see https://stackoverflow.com/questions/9605716/java-regular-expression-find-and-replace
        }
        if (lastIndex < templateStr.length()) {
            output.append(templateStr, lastIndex, templateStr.length());
        }
        return output.toString();
    }

    /**
     * Downloads a single definition into a temporary file and, once it has been written completely, moves it
     * to its location in the cache.
     *
     * @param recordName : three-letter name
     * @return true if successful download
     */
    private static boolean downloadChemCompRecord(String recordName) {
        String localName = getLocalFileName(recordName);
        File newFile;
        try {
            newFile = Files.createTempFile("chemcomp" + recordName, "cif").toFile();
            logger.debug("Will write chem comp file to temp file {}", newFile.toString());
        } catch (IOException e) {
            logger.error("Could not write to temp directory {} to create the chemical component download temp file", System.getProperty("java.io.tmpdir"));
            return false;
        }
        String u = serverBaseUrl + expandPathUrlTemplate(chemCompPathUrlTemplate, recordName);
        logger.debug("Downloading chem comp definition from {}", u);
        try {
            URL url = new URL(u);
            URLConnection uconn = URLConnectionTools.openURLConnection(url);
            try (BufferedWriter out = openGzipWriter(newFile);
                    BufferedReader fileBuffer = new BufferedReader(
                            new InputStreamReader(uconn.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                boolean success = false;
                while ((line = fileBuffer.readLine()) != null) {
                    out.write(line);
                    out.write(NEWLINE);
                    success = true;
                }
                if (!success) {
                    throw new IOException("Malformed URL or no content found in " + url.toString());
                }
                // closing the writer flushes it and, unlike PrintWriter, reports write failures
            }
            // Now we move this across to where it actually wants to be
            Files.move(newFile.toPath(), Paths.get(localName), StandardCopyOption.REPLACE_EXISTING);
            return true;
        } catch (IOException e) {
            // log the URL string: the URL object does not exist if the string was malformed
            logger.error("Could not download {} OR store locally to {} Error ={}",
                    u,
                    localName,
                    e.getMessage());
            newFile.delete();
        }
        return false;
    }

    /**
     * Downloads the complete dictionary and splits it into individual files. Only one thread performs the
     * installation; any other thread calling this method in the meantime waits until that thread has finished
     * and then returns without installing again.
     */
    private void downloadAllDefinitions() {
        // atomically claim the installation, so that two threads can't both pass the check
        if (!loading.compareAndSet(false, true)) {
            logger.info("Waiting for other thread to install chemical components...");
            while (loading.get()) {
                // another thread is already downloading the components definitions
                // wait for the other thread to finish...
                try {
                    // wait half a second
                    Thread.sleep(500);
                } catch (InterruptedException e) {
                    // restore the interrupt status for the caller and stop waiting
                    Thread.currentThread().interrupt();
                    logger.error("Thread interrupted {}", e.getMessage());
                    return;
                }
            }
            logger.info("Another thread installed the chemical components.");
            return;
        }
        try {
            long timeS = System.currentTimeMillis();
            logger.info("Performing first installation of chemical components.");
            logger.info("Downloading components.cif.gz ...");
            try {
                AllChemCompProvider.downloadFile();
            } catch (IOException e) {
                logger.error("Could not download the all chemical components file. Error: {}. "
                        + "Chemical components information won't be available", e.getMessage());
                // no point in trying to split if the file could not be downloaded
                return;
            }
            try {
                split();
            } catch (IOException e) {
                logger.error("Could not split all chem comp file into individual chemical component files. Error: {}",
                        e.getMessage());
                // no point in reporting time
                return;
            }
            long timeE = System.currentTimeMillis();
            logger.info("time to install chem comp dictionary: {} sec.", (timeE - timeS) / 1000);
        } finally {
            // always release the flag, otherwise waiting threads would block forever
            loading.set(false);
        }
    }

    /**
     * By default this provider will download only some of the {@link ChemComp} files.
     * The user has to request that all files should be downloaded by setting this parameter to true.
     *
     * @return flag if the all components should be downloaded and installed at startup. (default: false)
     */
    public boolean isDownloadAll() {
        return downloadAll;
    }

    /** By default this provider will download only some of the {@link ChemComp} files.
     * The user has to request that all files should be downloaded by setting this parameter to true.
     *
     * @param downloadAll if the all components should be downloaded and installed at startup. (default: false)
     */
    public void setDownloadAll(boolean downloadAll) {
        this.downloadAll = downloadAll;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy