Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price: only $1
You can buy this project and download/modify it as often as you want.
org.rcsb.strucmotif.update.StrucmotifUpdate Maven / Gradle / Ivy
package org.rcsb.strucmotif.update;
import com.google.gson.Gson;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import org.rcsb.cif.CifIO;
import org.rcsb.cif.ParsingException;
import org.rcsb.cif.schema.StandardSchemata;
import org.rcsb.cif.schema.mm.MmCifBlock;
import org.rcsb.cif.schema.mm.MmCifFile;
import org.rcsb.cif.schema.mm.PdbxAuditRevisionHistory;
import org.rcsb.cif.schema.mm.PdbxStructAssembly;
import org.rcsb.cif.schema.mm.PdbxStructAssemblyGen;
import org.rcsb.cif.schema.mm.PdbxStructOperList;
import org.rcsb.strucmotif.config.StrucmotifConfig;
import org.rcsb.strucmotif.domain.motif.ResiduePairOccurrence;
import org.rcsb.strucmotif.domain.structure.ResidueGraph;
import org.rcsb.strucmotif.domain.structure.Structure;
import org.rcsb.strucmotif.domain.structure.StructureInformation;
import org.rcsb.strucmotif.io.InvertedIndex;
import org.rcsb.strucmotif.io.StateRepository;
import org.rcsb.strucmotif.io.StructureDataProvider;
import org.rcsb.strucmotif.io.StructureIndexProvider;
import org.rcsb.strucmotif.math.Partition;
import org.rcsb.strucmotif.update.extractor.KeyExtractorFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.autoconfigure.data.mongo.MongoDataAutoConfiguration;
import org.springframework.boot.autoconfigure.domain.EntityScan;
import org.springframework.boot.autoconfigure.mongo.MongoAutoConfiguration;
import org.springframework.context.annotation.ComponentScan;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.UncheckedIOException;
import java.lang.reflect.Method;
import java.lang.reflect.Modifier;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* Runs strucmotif updates from the command-line.
*/
@SpringBootApplication(exclude = { MongoAutoConfiguration.class, MongoDataAutoConfiguration.class })
@ComponentScan({"org.rcsb.strucmotif"})
@EntityScan("org.rcsb.strucmotif")
public class StrucmotifUpdate implements CommandLineRunner {
/** Logger shared by all update operations of this class. */
private static final Logger logger = LoggerFactory.getLogger(StrucmotifUpdate.class);
/** Lower-case file name suffixes that qualify a file as structure input when a directory is scanned. */
private static final Set<String> STRUCTURE_EXTENSIONS = Set.of(".cif", ".cif.gz", ".bcif", ".bcif.gz");
/** Command-line keyword requesting all experimental entries reported by the search API. */
private static final String FULL = "full";
/** Command-line keyword requesting experimental entries plus computed structure models. */
private static final String FULL_CSM = "full_csm";
/**
 * Entry point from the command-line. Delegates to {@link SpringApplication#run(Class, String...)} with this
 * class as the primary source and passes the arguments through unchanged.
 * @param args command-line arguments
 */
public static void main(String[] args) {
SpringApplication.run(StrucmotifUpdate.class, args);
}
// bookkeeping of 'known' and 'dirty' structure identifiers
private final StateRepository stateRepository;
// access to original input streams and to the renumbered structure data (read, write, delete, commit)
private final StructureDataProvider structureDataProvider;
// the index that is committed after writing and pruned on removal/recovery
private final InvertedIndex invertedIndex;
// configuration values consulted throughout the update
private final StrucmotifConfig strucmotifConfig;
// hands out structure indices and resolves them for structure identifiers
private final StructureIndexProvider structureIndexProvider;
// selected once in the constructor based on the configured residue graph strategy
private final ResidueGraph.ResidueGraphOptions residueGraphOptions;
/**
 * Injectable constructor. Stores the collaborators and resolves the residue graph options that match the
 * configured residue graph strategy.
 * @param stateRepository the state repo
 * @param structureDataProvider data provider
 * @param invertedIndex inverted index
 * @param strucmotifConfig configs
 * @param structureIndexProvider index provider
 * @throws IllegalArgumentException if the configured residue graph strategy is not supported
 */
@Autowired
public StrucmotifUpdate(StateRepository stateRepository, StructureDataProvider structureDataProvider, InvertedIndex invertedIndex, StrucmotifConfig strucmotifConfig, StructureIndexProvider structureIndexProvider) {
    this.stateRepository = stateRepository;
    this.structureDataProvider = structureDataProvider;
    this.invertedIndex = invertedIndex;
    this.strucmotifConfig = strucmotifConfig;
    this.structureIndexProvider = structureIndexProvider;
    // one factory method per strategy, anything else is rejected
    this.residueGraphOptions = switch (strucmotifConfig.getResidueGraphStrategy()) {
        case DEPOSITED -> ResidueGraph.ResidueGraphOptions.deposited();
        case RESIDUES_IN_CONTACT -> ResidueGraph.ResidueGraphOptions.residuesInContact();
        case CHAINS_IN_CONTACT -> ResidueGraph.ResidueGraphOptions.chainsInContact();
        case ALL -> ResidueGraph.ResidueGraphOptions.all();
        default -> throw new IllegalArgumentException("Don't know how to handle " + strucmotifConfig.getResidueGraphStrategy());
    };
}
/**
 * Actual run method with the given arguments. Prints usage information when no argument is given. Otherwise
 * logs the effective configuration, recovers from a dirty state (unless the requested operation is the
 * recovery itself), removes partial index files and dispatches to the requested operation.
 * @param args command-line arguments, the first one names the operation
 * @throws Exception update failure
 */
public void run(String[] args) throws Exception {
    if (args.length < 1) {
        printUsage();
        return;
    }

    logger.info("Strucmotif config used for loading is:");
    for (Method method : strucmotifConfig.getClass().getMethods()) {
        int modifiers = method.getModifiers();
        String name = method.getName();
        // only plain, parameterless getters can be invoked reflectively without arguments
        if (Modifier.isStatic(modifiers) || !Modifier.isPublic(modifiers) || !name.startsWith("get") || name.equals("getClass") || method.getParameterCount() != 0) {
            continue;
        }
        // strip the "get" prefix only - replacing every occurrence would mangle names such as 'getTarget'
        logger.info(" strucmotif.{}: {}", kebabCase(name.substring("get".length())), method.invoke(strucmotifConfig));
    }

    // determine identifiers requested by user - either provided collection or all currently reported identifiers by RCSB PDB
    Operation operation = parseOperation(args);
    List<UpdateItem> requested = parseUpdateList(args);

    // check for sanity of internal state
    if (operation != Operation.RECOVER) {
        Set<String> dirtyStructureIdentifiers = stateRepository.selectDirty();
        if (!dirtyStructureIdentifiers.isEmpty()) {
            logger.warn("Update state is dirty - Problematic identifiers:\n{}",
                    dirtyStructureIdentifiers);
            logger.info("Recovering from dirty state");
            recover(dirtyStructureIdentifiers);
        }
    }

    // ensure no partial files linger
    deletePartialFiles();

    logger.info("Starting update - Operation: {}, {} ids ({})",
            operation,
            requested.size(),
            requested.stream()
                    .limit(5)
                    .map(item -> "\"" + item.getStructureIdentifier() + "\"")
                    .collect(Collectors.joining(", ", "[", requested.size() > 5 ? ", ...]" : "]")));

    switch (operation) {
        case ADD -> add(new Context(strucmotifConfig, getDeltaPlusIdentifiers(requested)));
        case REMOVE -> remove(getDeltaMinusIdentifiers(requested));
        case RECOVER -> recover(stateRepository.selectDirty());
    }

    // ensure no partial files are left behind by this run
    deletePartialFiles();
    logger.info("Finished update operation");
}
// boundary between a lower-case letter or digit and an upper-case letter, e.g. 'rootPath' -> 'root-Path'
private static final String RE1 = "([a-z0-9])([A-Z])";
// boundary between two upper-case letters when a lower-case letter follows, e.g. 'DBName' -> 'D-BName'
private static final String RE2 = "([A-Z])([A-Z])(?=[a-z])";
private static final String REPLACEMENT = "$1-$2";

/**
 * Converts a camel-case name into its kebab-case form, e.g. {@code CommitInterval} becomes
 * {@code commit-interval}.
 * @param s camel-case input
 * @return lower-case, hyphen-separated representation
 */
private String kebabCase(String s) {
    return s.replaceAll(RE1, REPLACEMENT)
            .replaceAll(RE2, REPLACEMENT)
            // locale-independent: the default locale may map 'I' to a dotless 'i' (e.g. Turkish)
            .toLowerCase(Locale.ROOT);
}
/**
 * Turns a local file into an update item: the key is derived from the file name, the location is the file's URL.
 * @param path file to wrap
 * @return the corresponding update item
 * @throws UncheckedIOException if the path cannot be expressed as URL
 */
private UpdateItem mapFile(Path path) {
    final String fileName = path.toFile().getName();
    try {
        return new UpdateItem(KeyExtractorFactory.getKey(fileName), path.toUri().toURL());
    } catch (MalformedURLException malformed) {
        throw new UncheckedIOException(malformed);
    }
}
/**
 * The 'ADD' operation. Splits the update items of the context into partitions of the configured commit
 * interval, processes each partition in parallel and commits after every partition that handled at least one
 * structure. If the last partition handled none, a trailing commit is issued after the loop.
 * @param context the set of UpdateItems to add
 * @throws IOException writing to index fails
 */
public void add(Context context) throws IOException {
    long target = context.updateItems.size();
    logger.info("{} files to process in total", target);

    Partition<UpdateItem> partitions = new Partition<>(context.updateItems, strucmotifConfig.getCommitInterval());
    if (partitions.size() > 1) {
        logger.info("Formed {} partitions of {} structures", partitions.size(), strucmotifConfig.getCommitInterval());
    }

    Set<String> known = getKnown();
    boolean needsCommit = false;
    // split into partitions and process
    for (int i = 0; i < partitions.size(); i++) {
        List<UpdateItem> partition = partitions.get(i);

        // per-partition state that is read for logging while items are handled
        context.partitionSize = partition.size();
        context.partitionContext = (i + 1) + " / " + partitions.size();
        context.structureCounter = new AtomicInteger();

        partition.parallelStream().forEach(item -> handleUpdateItem(item, context));

        needsCommit = true;
        if (context.structureCounter.get() > 0) {
            commit(context, known);
            needsCommit = false;
        }
    }

    if (needsCommit) {
        commit(context, known);
    }
}
/**
 * Persists everything that was processed since the last commit. Identifiers are flagged as dirty before the
 * structure data and the inverted index are committed, and the flag is lifted once they are registered as known.
 * @param context update context holding the processed structures
 * @param known identifiers that were already known when the update started
 * @throws IOException if flushing or committing fails
 */
private void commit(Context context, Set known) throws IOException {
    logger.info("Committing data on {} structures", context.processed.size());

    // flush per-thread streams
    context.flush();

    // mark as dirty before index update
    // items detected as known after reading entry.id are ignored (happens when processing URLs)
    Set pendingDirty = context.processed.stream()
            .map(StructureInformation::structureIdentifier)
            .filter(identifier -> !known.contains(identifier))
            .collect(Collectors.toSet());
    stateRepository.insertDirty(pendingDirty);

    structureDataProvider.commit();
    invertedIndex.commit();

    // processed contains all StructureIdentifiers + corresponding revision
    stateRepository.insertKnown(context.processed);
    Set committedIdentifiers = context.processed.stream()
            .map(StructureInformation::structureIdentifier)
            .collect(Collectors.toSet());
    stateRepository.deleteDirty(committedIdentifiers);

    // reset state
    context.processed.clear();
    context.close();
}
/**
 * Processes one update item, retrying when downloading or parsing the source file fails. The last failure is
 * rethrown once the configured number of tries is exhausted.
 * @param item the item to process
 * @param context update context, used for logging and bookkeeping
 */
private void handleUpdateItem(UpdateItem item, Context context) {
    // a configured value below 1 would skip every item without any notice - always try at least once
    int maxRetries = Math.max(1, strucmotifConfig.getDownloadTries());
    for (int i = 1; i <= maxRetries; i++) {
        try {
            handleUpdateItemInternal(item, context);
            break;
        } catch (UncheckedIOException | ParsingException e) {
            if (i >= maxRetries) {
                // max retries exceeded
                throw e;
            }

            int count = context.structureCounter.get();
            String source = item.getUrl() != null ? item.getUrl().toString() : item.getStructureIdentifier();
            // the closing/opening brackets are part of the value because the log pattern wraps it in '[' and ']'
            String structureContext = count + " / " + context.partitionSize + "] [" + source;
            logger.warn("[{}] [{}] [try: {} / {}] Failed to download or parse source file - {}",
                    context.partitionContext, structureContext, i, maxRetries, e.getMessage());
        }
    }
}
/**
 * Single attempt at processing an update item: reads the source file, validates the presence of revision and
 * assembly categories according to the configured strategies, writes the renumbered structure, computes the
 * residue graph and appends all residue pair occurrences to the output streams of the context.
 * @param item the item to process
 * @param context update context
 * @throws UncheckedIOException if the source cannot be read
 * @throws ParsingException if the source cannot be parsed
 * @throws IllegalArgumentException if a mandatory category is missing and the strategy is FAIL
 * @throws RuntimeException if residue graph computation or writing of residue pairs fails
 */
private void handleUpdateItemInternal(UpdateItem item, Context context) {
    String structureIdentifier = item.getStructureIdentifier();
    int structureIndex = structureIndexProvider.nextStructureIndex();
    // defaults that apply when no revision history is present
    int majorRevision = 1;
    int minorRevision = 0;

    // try-with-resources: the source stream is released on every path, including failed parsing
    try (InputStream inputStream = handleInputStream(item, context)) {
        // get some clean metadata
        MmCifFile mmCifFile = CifIO.readFromInputStream(inputStream).as(StandardSchemata.MMCIF);
        MmCifBlock block = mmCifFile.getFirstBlock();

        // assert that all categories are present
        PdbxAuditRevisionHistory revisionHistory = block.getPdbxAuditRevisionHistory();
        if (revisionHistory.isDefined()) {
            // the last row is taken as the current revision
            majorRevision = revisionHistory.getMajorRevision().get(revisionHistory.getRowCount() - 1);
            minorRevision = revisionHistory.getMinorRevision().get(revisionHistory.getRowCount() - 1);
        } else {
            switch (strucmotifConfig.getMissingRevisionStrategy()) {
                case IGNORE -> {}
                case WARN -> logger.warn("[{}] [{}] 'pdbx_audit_revision_history' is undefined, defaulting to 1.0", context.partitionContext, structureIdentifier);
                case FAIL -> throw new IllegalArgumentException("'pdbx_audit_revision_history' is mandatory in input files - rejecting " + structureIdentifier);
            }
        }

        PdbxStructAssembly pdbxStructAssembly = block.getPdbxStructAssembly();
        PdbxStructAssemblyGen pdbxStructAssemblyGen = block.getPdbxStructAssemblyGen();
        PdbxStructOperList pdbxStructOperList = block.getPdbxStructOperList();
        if (!pdbxStructAssembly.isDefined() || !pdbxStructAssemblyGen.isDefined() || !pdbxStructOperList.isDefined()) {
            switch (strucmotifConfig.getMissingAssemblyStrategy()) {
                case IGNORE -> {}
                case WARN -> logger.warn("[{}] [{}] 'pdbx_struct_assembly', 'pdbx_struct_assembly_gen', or 'pdbx_struct_oper_list' are undefined", context.partitionContext, structureIdentifier);
                case FAIL -> throw new IllegalArgumentException("'pdbx_struct_assembly', 'pdbx_struct_assembly_gen', and 'pdbx_struct_oper_list' are mandatory in input files - rejecting " + structureIdentifier);
            }
        }

        // write renumbered structure
        logger.debug("[{}] [{}] Writing renumbered structure file", context.partitionContext, structureIdentifier);
        structureDataProvider.writeRenumbered(structureIdentifier, mmCifFile);
    } catch (IOException e) {
        throw new UncheckedIOException("Cif parsing failed for " + structureIdentifier, e);
    } catch (ParsingException e) {
        throw new ParsingException("Cif parsing failed for " + structureIdentifier, e);
    }
    context.processed.add(new StructureInformation(structureIdentifier, structureIndex, majorRevision, minorRevision));

    int count = context.structureCounter.incrementAndGet();
    // the closing/opening brackets are part of the value because the log patterns wrap it in '[' and ']'
    String structureContext = count + " / " + context.partitionSize + "] [" + structureIdentifier;

    // fails when file is missing, expected when e.g. all residues of the file were below the pLDDT threshold
    Structure structure;
    try {
        structure = structureDataProvider.readRenumbered(structureIdentifier);
    } catch (UncheckedIOException e) {
        logger.warn("[{}] [{}] No renumbered source file present - Skipping", context.partitionContext, structureContext);
        return;
    }

    try {
        long startGraph = System.nanoTime();
        ResidueGraph residueGraph = new ResidueGraph(structure, strucmotifConfig, residueGraphOptions);
        logger.info("[{}] [{}] Computed residue graph ({} residues, {} pairs) in {} ms",
                context.partitionContext,
                structureContext,
                residueGraph.getResidueCount(),
                residueGraph.getPairingCount(),
                (System.nanoTime() - startGraph) / 1000 / 1000);

        // extract motifs, sort into bins of same prefix
        long startWrite = System.nanoTime();
        // structure index with the most significant bit set, written in front of each run of identifiers
        int magicStructureIndex = structureIndex | 1 << 31;
        // reused for both the 8-byte header (structure index + descriptor) and the 8-byte identifiers
        byte[] buffer = new byte[8];
        AtomicInteger structureMotifCounter = new AtomicInteger();
        AtomicInteger lastDescriptor = new AtomicInteger(); // 0 is safe as nothing should be AA-0-0-0
        residueGraph.residuePairOccurrencesSequential()
                // sorting groups equal descriptors so that the header is written once per descriptor
                .sorted(Comparator.comparingInt(ResiduePairOccurrence::getResiduePairDescriptor))
                .forEach(o -> {
                    try {
                        int descriptor = o.getResiduePairDescriptor();
                        OutputStream outputStream = context.getOutputStream(descriptor);

                        if (descriptor != lastDescriptor.get()) {
                            // header, big-endian: 4 bytes flagged structure index followed by 4 bytes descriptor
                            buffer[0] = (byte) (magicStructureIndex >>> 24);
                            buffer[1] = (byte) (magicStructureIndex >>> 16);
                            buffer[2] = (byte) (magicStructureIndex >>> 8);
                            buffer[3] = (byte) magicStructureIndex;
                            buffer[4] = (byte) (descriptor >>> 24);
                            buffer[5] = (byte) (descriptor >>> 16);
                            buffer[6] = (byte) (descriptor >>> 8);
                            buffer[7] = (byte) descriptor;
                            outputStream.write(buffer);
                            lastDescriptor.set(descriptor);
                        }

                        // payload, big-endian: 8 bytes residue pair identifier
                        long identifier = o.getResiduePairIdentifier();
                        buffer[0] = (byte) (identifier >>> 56);
                        buffer[1] = (byte) (identifier >>> 48);
                        buffer[2] = (byte) (identifier >>> 40);
                        buffer[3] = (byte) (identifier >>> 32);
                        buffer[4] = (byte) (identifier >>> 24);
                        buffer[5] = (byte) (identifier >>> 16);
                        buffer[6] = (byte) (identifier >>> 8);
                        buffer[7] = (byte) identifier;
                        outputStream.write(buffer);

                        structureMotifCounter.incrementAndGet();
                    } catch (IOException e) {
                        throw new UncheckedIOException(e);
                    }
                });

        logger.info("[{}] [{}] Wrote {} residue pairs in {} ms",
                context.partitionContext,
                structureContext,
                structureMotifCounter.get(),
                (System.nanoTime() - startWrite) / 1000 / 1000);
    } catch (Exception e) {
        logger.error("[{}] [{}] Residue graph determination failed",
                context.partitionContext,
                structureContext,
                e);
        // fail complete update
        throw new RuntimeException("Residue graph determination failed for " + structureIdentifier, e);
    }
}
/**
 * Acquire an input stream for the requested item. Items that carry a URL are read from that URL, all others
 * are resolved by their structure identifier via the structure data provider.
 * @param item request
 * @param context context for logging purposes (not read by this implementation)
 * @throws IOException when reading fails
 * @return an InputStream
 */
protected InputStream handleInputStream(UpdateItem item, Context context) throws IOException {
    final URL source = item.getUrl();
    if (source == null) {
        // no explicit location - let the data provider resolve the identifier
        return structureDataProvider.getOriginalInputStream(item.getStructureIdentifier());
    }

    logger.info("[{}] Processing {}", item.getStructureIdentifier(), source);
    return source.openStream();
}
/**
 * 'REMOVE' operation. Flags all identifiers as dirty, deletes their renumbered structure data, removes the
 * associated structure indices from the inverted index and finally drops the identifiers from the known and
 * dirty collections.
 * @param identifierList set of identifiers to remove
 */
public void remove(Collection<String> identifierList) {
    Set<String> identifiers = new HashSet<>(identifierList);
    // mark everything that will be touched as dirty in case this operation fails
    stateRepository.insertDirty(identifiers);

    // modifying the renumbered file bundle is expensive and is done as batch
    logger.info("Removing {} renumbered structure", identifiers.size());
    structureDataProvider.deleteRenumbered(identifiers);
    logger.debug("Done removing renumbered structure");

    // modifying the inverted index is expensive and is done as batch
    if (!identifiers.isEmpty()) {
        // only identifiers with a registered structure index can be present in the inverted index
        Set<Integer> mapped = identifiers.stream()
                .filter(structureIndexProvider::containsKey)
                .map(structureIndexProvider::selectStructureIndex)
                .collect(Collectors.toSet());
        if (!mapped.isEmpty()) {
            invertedIndex.delete(mapped);
        }
    }

    logger.info("Updating holdings");
    stateRepository.deleteKnown(identifiers);
    stateRepository.deleteDirty(identifiers);
    logger.info("Finished removal operation");
}
/**
 * Reports all structures currently present in the PDB archive by querying the RCSB PDB Search API and
 * wrapping every element of the {@code result_set} array of the response.
 * @param mode what data (pdb/csm) to include
 * @return collection of update items, mutable so that callers may shuffle it
 * @throws IOException connection failure
 */
public List<UpdateItem> getAllIdentifiers(String mode) throws IOException {
    logger.info("Retrieving current {} entry list from RCSB PDB Search API", mode);
    URL url = composeSearchUrl(mode);
    logger.info("URL: {}", url);

    List<UpdateItem> out = new ArrayList<>();
    // decode explicitly as UTF-8 rather than relying on the platform default charset
    try (InputStream inputStream = url.openStream();
         InputStreamReader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) {
        JsonElement jsonElement = new Gson().fromJson(reader, JsonElement.class);
        JsonObject jsonObject = jsonElement.getAsJsonObject();
        jsonObject.getAsJsonArray("result_set")
                .forEach(id -> out.add(new UpdateItem(id.getAsString())));
    }
    return out;
}
/**
 * Builds the Search API URL that reports all entry identifiers for the given mode.
 * @param mode either the experimental-only or the experimental-plus-computational keyword
 * @return URL with the URL-encoded JSON query attached as 'json' parameter
 * @throws MalformedURLException if the composed URL is invalid
 * @throws UnsupportedOperationException if the mode is not supported
 */
private URL composeSearchUrl(String mode) throws MalformedURLException {
    final String types;
    switch (mode) {
        case FULL:
            types = "\"experimental\"";
            break;
        case FULL_CSM:
            types = "\"computational\", \"experimental\"";
            break;
        default:
            throw new UnsupportedOperationException(mode + " is not yet implemented");
    }

    // '{types}' is a placeholder that is substituted below
    final String template = """
    {
    "query": {
    "type": "terminal",
    "label": "text",
    "service": "text",
    "parameters": {
    "attribute": "rcsb_entry_container_identifiers.entry_id",
    "operator": "exists",
    "negation": false
    }
    },
    "return_type": "entry",
    "request_options": {
    "results_content_type": [{types}],
    "return_all_hits": true,
    "results_verbosity": "compact"
    }
    }
    """;
    final String encodedQuery = URLEncoder.encode(template.replace("{types}", types), StandardCharsets.UTF_8);
    return new URL("https://search.rcsb.org/rcsbsearch/v2/query?json=" + encodedQuery);
}
/**
 * Determine all IDs that need to be added to the archive: requested items whose identifier is not known yet.
 * @param requested the requested update
 * @return UpdateItem collection that need to be processed for the given context - the requested list itself
 * if nothing is known yet, otherwise an unmodifiable filtered copy
 */
public List<UpdateItem> getDeltaPlusIdentifiers(List<UpdateItem> requested) {
    Set<String> known = getKnown();
    if (known.isEmpty()) {
        logger.warn("No existing data - Starting from scratch");
        return requested;
    } else {
        return requested.stream()
                .filter(item -> !known.contains(item.getStructureIdentifier()))
                .toList();
    }
}
/**
 * Determine all IDs that need to be removed from the archive: known identifiers that are part of the request.
 * @param requested the requested update
 * @return structure identifiers that need to be removed for the given context, empty if nothing is known
 */
public List<String> getDeltaMinusIdentifiers(List<UpdateItem> requested) {
    Set<String> known = getKnown();
    if (known.isEmpty()) {
        logger.warn("No existing data - no need for cleanup of obsolete entries");
        return Collections.emptyList();
    } else {
        Set<String> unwrapped = requested.stream()
                .map(UpdateItem::getStructureIdentifier)
                .collect(Collectors.toSet());
        return known.stream()
                .filter(unwrapped::contains)
                .toList();
    }
}
/**
 * Collects the structure identifiers of everything the state repository reports as known.
 * @return set of known structure identifiers
 */
private Set<String> getKnown() {
    return stateRepository.selectKnown()
            .stream()
            .map(StructureInformation::structureIdentifier)
            .collect(Collectors.toSet());
}
/**
 * 'RECOVER' operation. Removes all dirty structures and then deletes keys from the inverted index that the
 * state repository does not report.
 * @param dirty identifiers flagged as dirty
 */
private void recover(Set<String> dirty) {
    remove(dirty);

    // this will happen when writing to the inverted index fails: then bins can be corrupted and filled with structure indices that point nowhere
    logger.info("Screening for lingering structures in the index");
    Set<Integer> knownToIndex = invertedIndex.reportKnownKeys();
    Set<Integer> knownToState = stateRepository.reportKnownKeys();
    Set<Integer> lingeringInIndex = knownToIndex.stream()
            .filter(i -> !knownToState.contains(i))
            .collect(Collectors.toSet());
    if (!lingeringInIndex.isEmpty()) {
        logger.info("{} lingering keys detected - removing...", lingeringInIndex.size());
        invertedIndex.delete(lingeringInIndex);
    }
}
/**
 * Prints usage information to standard out: valid operations, the special keywords and some examples.
 */
private void printUsage() {
    // an empty entry produces a blank line
    final String[] usage = {
            "Too few arguments",
            "",
            "Usage: java -Xmx12G -jar update.jar operation ...",
            "Valid operation values: " + Arrays.toString(Operation.values()),
            "Optionally: list of entry ids - (no argument performs null operation, use single argument '" + FULL + "' for complete update based on RCSB PDB holdings, use '" + FULL_CSM + "' to include computed structure models from AlphaFold DB)",
            "If you want to update entries you have to explicitly remove them first",
            "",
            "Example: java -Xmx12G -jar update.jar ADD 1acj 1exr 4hhb",
            "Example: java -Xmx12G -jar update.jar ADD path /opt/data/pdb/",
            "",
            "You can also provide URLs to index non-archived CIF files, in that case you must provide a unique, preferably namespaced identifier which will be used to index this item",
            "Example: java -Xmx12G -jar update.jar ADD AF-Q76EI6-F1,https://alphafold.ebi.ac.uk/files/AF-Q76EI6-F1-model_v1.cif MA-9Z55Z,file:///path/to/ma-9z55z.cif"
    };
    for (String line : usage) {
        System.out.println(line);
    }
}
/**
 * Resolves the requested operation from the first command-line argument via {@code Operation.resolve}.
 * Callers must ensure that at least one argument is present.
 * @param args command-line arguments
 * @return the operation named by the first argument
 */
private Operation parseOperation(String[] args) {
return Operation.resolve(args[0]);
}
// classic 4-character PDB-ID or extended 'PDB_' + 8 characters form
private static final String PDB_REGEX = "^[1-9][a-zA-Z0-9]{3}|PDB_[a-zA-Z0-9]{8}$";
private static final String CSM_REGEX = "^[a-zA-Z0-9]+_[a-zA-Z0-9]{6,}$"; // pattern used by rcsb.org for computed structure models

/**
 * Determines the update items requested on the command-line. Everything after the operation keyword is
 * interpreted as either the single keyword for a complete update, the keyword 'path' followed by a directory
 * that is scanned recursively for structure files, or a list of identifiers / 'identifier,url' pairs.
 * @param args command-line arguments, the first one is the operation and is skipped
 * @return mutable, shuffled list of update items
 * @throws IOException if the identifier list cannot be retrieved or the directory cannot be traversed
 * @throws IllegalArgumentException if an argument is neither a valid identifier nor an 'identifier,url' pair
 */
private List<UpdateItem> parseUpdateList(String[] args) throws IOException {
    // everything after the operation keyword
    String[] ids = Arrays.copyOfRange(args, 1, args.length);
    List<UpdateItem> requested;
    if (ids.length == 1 && (ids[0].equalsIgnoreCase(FULL) || ids[0].equalsIgnoreCase(FULL_CSM))) {
        requested = getAllIdentifiers(ids[0].toLowerCase(Locale.ROOT));
    } else if (ids.length == 2 && ids[0].equalsIgnoreCase("path")) {
        try (Stream<Path> paths = Files.walk(Paths.get(ids[1]))) {
            requested = paths.filter(Files::isRegularFile)
                    .filter(path -> {
                        // locale-independent lower-casing, the default locale may mangle names containing 'I'
                        String fileName = path.toFile().getName().toLowerCase(Locale.ROOT);
                        return STRUCTURE_EXTENSIONS.stream().anyMatch(ext -> fileName.endsWith(ext));
                    })
                    .map(this::mapFile)
                    // must be mutable as shuffle will happen downstream
                    .collect(Collectors.toCollection(ArrayList::new));
        }
    } else {
        requested = Arrays.stream(ids)
                .map(this::parseUpdateItem)
                // must be mutable as shuffle will happen downstream
                .collect(Collectors.toCollection(ArrayList::new));
    }
    Collections.shuffle(requested);
    return requested;
}

// Parses one command-line token: a plain identifier (upper-cased) or an 'identifier,url' pair (URL left as is).
private UpdateItem parseUpdateItem(String id) {
    if (id.matches(PDB_REGEX) || id.matches(CSM_REGEX)) {
        return new UpdateItem(id.toUpperCase(Locale.ROOT));
    }

    String[] split = id.split(",");
    if (split.length != 2) {
        throw new IllegalArgumentException("Cannot parse line: '" + id + "' - format is '${4-digit-entryId}' or '${identifier},${url}'");
    }

    try {
        String key = split[0].toUpperCase(Locale.ROOT);
        URL url = new URL(split[1]);
        return new UpdateItem(key, url);
    } catch (MalformedURLException e) {
        // keep the cause so that the offending part of the URL is reported
        throw new IllegalArgumentException("Cannot parse line: '" + id + "' - not a valid URL", e);
    }
}
/**
 * Deletes partial index files from the root path, i.e. files whose name starts with the index prefix and ends
 * with the temporary extension.
 * @throws IOException if the root path cannot be listed
 */
private void deletePartialFiles() throws IOException {
    try (Stream<Path> paths = Files.list(Paths.get(strucmotifConfig.getRootPath()))) {
        paths.filter(p -> {
                    // compare the file name as String: Path#endsWith matches whole path elements, not suffixes
                    String fileName = p.getFileName().toString();
                    return fileName.startsWith(StrucmotifConfig.INDEX) && fileName.endsWith(StrucmotifConfig.TMP_EXT);
                })
                .map(Path::toFile)
                .forEach(file -> {
                    if (!file.delete()) {
                        logger.warn("Could not delete partial file {}", file);
                    }
                });
    }
}
}