package org.rcsb.strucmotif.update;

import org.rcsb.cif.CifIO;
import org.rcsb.cif.ParsingException;
import org.rcsb.cif.schema.StandardSchemata;
import org.rcsb.cif.schema.mm.MmCifFile;
import org.rcsb.strucmotif.config.StrucmotifConfig;
import org.rcsb.strucmotif.core.ThreadPool;
import org.rcsb.strucmotif.domain.bucket.ResiduePairIdentifierBucket;
import org.rcsb.strucmotif.domain.motif.ResiduePairDescriptor;
import org.rcsb.strucmotif.domain.motif.ResiduePairIdentifier;
import org.rcsb.strucmotif.domain.structure.AssemblyInformation;
import org.rcsb.strucmotif.domain.structure.ResidueGraph;
import org.rcsb.strucmotif.domain.structure.Revision;
import org.rcsb.strucmotif.domain.structure.Structure;
import org.rcsb.strucmotif.domain.structure.StructureInformation;
import org.rcsb.strucmotif.io.InvertedIndex;
import org.rcsb.strucmotif.io.StateRepository;
import org.rcsb.strucmotif.io.StructureDataProvider;
import org.rcsb.strucmotif.io.StructureIndexProvider;
import org.rcsb.strucmotif.math.Partition;
import org.rcsb.strucmotif.update.extractor.KeyExtractorFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.autoconfigure.data.mongo.MongoDataAutoConfiguration;
import org.springframework.boot.autoconfigure.domain.EntityScan;
import org.springframework.boot.autoconfigure.mongo.MongoAutoConfiguration;
import org.springframework.context.annotation.ComponentScan;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UncheckedIOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.MatchResult;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import static org.rcsb.strucmotif.domain.structure.ResidueGraph.ResidueGraphOptions.depositedAndContacts;

/**
 * Runs strucmotif updates from the command-line.
 */
@SpringBootApplication(exclude = { MongoAutoConfiguration.class, MongoDataAutoConfiguration.class })
@ComponentScan({"org.rcsb.strucmotif"})
@EntityScan("org.rcsb.strucmotif")
public class StrucmotifUpdate implements CommandLineRunner {
    private static final Logger logger = LoggerFactory.getLogger(StrucmotifUpdate.class);
    private static final Set<String> STRUCTURE_EXTENSIONS = Set.of(".cif", ".cif.gz", ".bcif", ".bcif.gz");

    /**
     * Entry point from the command-line.
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        SpringApplication.run(StrucmotifUpdate.class, args);
    }

    private final StateRepository stateRepository;
    private final StructureDataProvider structureDataProvider;
    private final InvertedIndex invertedIndex;
    private final StrucmotifConfig strucmotifConfig;
    private final ThreadPool threadPool;
    private final StructureIndexProvider structureIndexProvider;

    /**
     * Injectable constructor.
     * @param stateRepository the state repository
     * @param structureDataProvider the structure data provider
     * @param invertedIndex the inverted index
     * @param strucmotifConfig the strucmotif configuration
     * @param threadPool the thread pool
     * @param structureIndexProvider the structure index provider
     */
    @Autowired
    public StrucmotifUpdate(StateRepository stateRepository, StructureDataProvider structureDataProvider, InvertedIndex invertedIndex, StrucmotifConfig strucmotifConfig, ThreadPool threadPool, StructureIndexProvider structureIndexProvider) {
        this.stateRepository = stateRepository;
        this.structureDataProvider = structureDataProvider;
        this.invertedIndex = invertedIndex;
        this.strucmotifConfig = strucmotifConfig;
        this.threadPool = threadPool;
        this.structureIndexProvider = structureIndexProvider;
    }

    /**
     * Runs the update with the given command-line arguments.
     * @param args command-line arguments
     * @throws Exception update failure
     */
    public void run(String[] args) throws Exception {
        if (args.length < 1) {
            System.out.println("Too few arguments");
            System.out.println();
            System.out.println("Usage: java -Xmx12G -jar update.jar operation ...");
            System.out.println("Valid operation values: " + Arrays.toString(Operation.values()));
            System.out.println("Optionally: list of entry ids - (no argument performs null operation, use single argument 'full' for complete update)");
            System.out.println("If you want to update entries you have to explicitly remove them first");
            System.out.println();
            System.out.println("Example: java -Xmx12G -jar update.jar ADD 1acj 1exr 4hhb");
            System.out.println("Example: java -Xmx12G -jar update.jar ADD path /opt/data/pdb/");
            System.out.println();
            System.out.println("You can also provide URLs to index non-archived CIF files, in that case you must provide a unique, preferably namespaced identifier which will be used to index this item");
            System.out.println("Example: java -Xmx12G -jar update.jar ADD AF-Q76EI6-F1,https://alphafold.ebi.ac.uk/files/AF-Q76EI6-F1-model_v1.cif MA-9Z55Z,file:///path/to/ma-9z55z.cif");
            return;
        }

        // determine identifiers requested by the user - either the provided collection or all identifiers currently reported by RCSB PDB
        Operation operation = Operation.resolve(args[0]);
        String[] ids = new String[args.length - 1];
        List<UpdateItem> requested;
        System.arraycopy(args, 1, ids, 0, ids.length);
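        // three ways to select items: the single argument 'full' (all current RCSB PDB entries), 'path <directory>'
        // (all structure files below that directory), or an explicit list of entry IDs and identifier,URL pairs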
        if (ids.length == 1 && ids[0].equalsIgnoreCase("full")) {
            requested = getAllIdentifiers();
        } else if (ids.length == 2 && ids[0].equalsIgnoreCase("path")) {
            requested = Files.walk(Paths.get(ids[1]))
                    .filter(path -> STRUCTURE_EXTENSIONS.stream().anyMatch(ext -> path.toFile().getName().toLowerCase().endsWith(ext)))
                    .map(this::mapFile)
                    .collect(Collectors.toList());
        } else {
            requested = Arrays.stream(ids)
                    // upper-case PDB-IDs, leave URLs be
                    .map(id -> {
                        String[] split = id.split(",");
                        if (id.length() == 4) {
                            return new UpdateItem(id.toUpperCase());
                        } else if (split.length == 2) {
                            try {
                                String key = split[0].toUpperCase();
                                URL url = new URL(split[1]);
                                return new UpdateItem(key, url);
                            } catch (MalformedURLException e) {
                                throw new IllegalArgumentException("Cannot parse line: '" + id + "' - not a valid URL");
                            }
                        } else {
                            throw new IllegalArgumentException("Cannot parse line: '" + id + "' - format is '${4-digit-entryId}' or '${identifier},${url}'");
                        }
                    })
                    .collect(Collectors.toList());
        }
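        // shuffle so partitions see a random mix of entries (presumably to even out per-partition workload)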
        Collections.shuffle(requested);

        // check for sanity of internal state
        if (operation != Operation.RECOVER) {
            Collection<String> dirtyStructureIdentifiers = stateRepository.selectDirty();
            if (dirtyStructureIdentifiers.size() > 0) {
                logger.warn("Update state is dirty - Problematic identifiers:\n{}",
                        dirtyStructureIdentifiers);
                logger.info("Recovering from dirty state");
                recover(dirtyStructureIdentifiers);
            }
        }

        logger.info("Starting update - Operation: {}, {} ids ({})",
                operation,
                requested.size(),
                requested.stream()
                        .limit(5)
                        .map(item -> "\"" + item.getStructureIdentifier() + "\"")
                        .collect(Collectors.joining(", ", "[",  requested.size() > 5 ? ", ...]" : "]")));

        switch (operation) {
            case ADD:
                add(getDeltaPlusIdentifiers(requested));
                break;
            case REMOVE:
                remove(getDeltaMinusIdentifiers(requested));
                break;
            case RECOVER:
                recover(stateRepository.selectDirty());
                break;
        }

        logger.info("Finished update operation");
    }

    private UpdateItem mapFile(Path path) {
        try {
            return new UpdateItem(KeyExtractorFactory.getKey(path.toFile().getName()), path.toUri().toURL());
        } catch (MalformedURLException e) {
            throw new UncheckedIOException(e);
        }
    }

    /**
     * The 'ADD' operation.
     * @param items collection of UpdateItems to add
     * @throws ExecutionException update failure
     * @throws InterruptedException update failure
     */
    public void add(Collection<UpdateItem> items) throws ExecutionException, InterruptedException {
        long target = items.size();
        logger.info("{} files to process in total", target);

        Partition<UpdateItem> partitions = new Partition<>(items, strucmotifConfig.getUpdateChunkSize());
        logger.info("Formed {} partitions of {} structures",
                partitions.size(),
                strucmotifConfig.getUpdateChunkSize());

        Set<String> known = stateRepository.selectKnown()
                .stream()
                .map(StructureInformation::getStructureIdentifier)
                .collect(Collectors.toSet());
        Context context = new Context(known);

        // split into partitions and process
        for (int i = 0; i < partitions.size(); i++) {
            context.partitionContext = (i + 1) + " / " + partitions.size();

            List<UpdateItem> partition = partitions.get(i);
            logger.info("[{}] Start processing partition", context.partitionContext);

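            // reset per-partition state: the structure counter and the buffer that collects residue pairs until persist()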
            context.structureCounter = new AtomicInteger();
            context.buffer = new ConcurrentHashMap<>();
            threadPool.submit(() -> {
                partition.parallelStream().forEach(item -> handleUpdateItem(item, context));
                return null;
            }).get();

            // mark as dirty only around index update
            Set<String> dirty = context.processed.stream()
                    .map(StructureInformation::getStructureIdentifier)
                    // ignore items detected as known after reading entry.id (happens when processing URLs)
                    .filter(id -> !known.contains(id))
                    .collect(Collectors.toSet());
            stateRepository.insertDirty(dirty);
            persist(context);
        }
    }

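    /**
     * Processes a single item, retrying on I/O failures (e.g. a failed download).
     * The number of attempts is controlled by the configured download tries; the last failure is rethrown.
     */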
    private void handleUpdateItem(UpdateItem item, Context context) {
        int maxRetries = strucmotifConfig.getDownloadTries();
        for (int i = 1; i <= maxRetries; i++) {
            try {
                handleUpdateItemInternal(item, context);
                break;
            } catch (UncheckedIOException e) {
                if (i >= maxRetries) {
                    // max retries exceeded
                    throw e;
                }

                int count = context.structureCounter.get();
                String source = item.getUrl() != null ? item.getUrl().toString() : item.getStructureIdentifier();
                String structureContext = count + " / " + strucmotifConfig.getUpdateChunkSize() + "] [" + source;
                logger.warn("[{}] [{}] [try: {} / {}] Failed to download source file - {}",
                        context.partitionContext,
                        structureContext,
                        i,
                        maxRetries,
                        e.getMessage());
            }
        }
    }

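    /**
     * Processes one structure: reads the source CIF, captures revision and assembly metadata, writes the
     * renumbered structure, then enumerates all residue pairs and buffers them for the inverted index.
     */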
    private void handleUpdateItemInternal(UpdateItem item, Context context) {
        String structureIdentifier = item.getStructureIdentifier();
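        // each structure is registered under a compact int index, which is how the inverted index references it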
        int structureIndex = structureIndexProvider.nextStructureIndex();
        try {
            InputStream inputStream = handleInputStream(item, context);

            // get some clean metadata
            MmCifFile mmCifFile = CifIO.readFromInputStream(inputStream).as(StandardSchemata.MMCIF);

            boolean hasRevision = mmCifFile.getFirstBlock().getPdbxAuditRevisionHistory().isDefined();
            // if revision isn't set (happens e.g. for ModelArchive files) then set to 1.0 by default
            Revision revision = hasRevision ? new Revision(mmCifFile) : new Revision(1, 0);
            Map<String, String[]> assemblyInformation = AssemblyInformation.of(mmCifFile);

            // write renumbered structure
            structureDataProvider.writeRenumbered(structureIdentifier, mmCifFile);
            context.processed.add(new StructureInformation(structureIdentifier, structureIndex, revision, assemblyInformation));
        } catch (IOException e) {
            throw new UncheckedIOException("Cif parsing failed for " + structureIdentifier, e);
        } catch (ParsingException e) {
            logger.warn("Cif parsing failed for " + structureIdentifier, e);
            throw e;
        }

        int count = context.structureCounter.incrementAndGet();
        String structureContext = count + " / " + strucmotifConfig.getUpdateChunkSize() + "] [" + structureIdentifier;

        // fails when file is missing (should not happen) or does not contain valid polymer chain
        Structure structure;
        try {
            structure = structureDataProvider.readRenumbered(structureIdentifier);
        } catch (UncheckedIOException e) {
            logger.warn("[{}] [{}] No valid polymer chain(s) - Skipping",
                    context.partitionContext,
                    structureContext);
            return;
        }

        if (!strucmotifConfig.isUndefinedAssemblies() && structure.getAssemblies().isEmpty()) {
            logger.warn("[{}] [{}] No assembly information - Configured to skip",
                    context.partitionContext,
                    structureContext);
            return;
        }

        try {
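            // build the residue graph that enumerates all residue pairs considered for indexing
            // (see ResidueGraphOptions.depositedAndContacts)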
            ResidueGraph residueGraph = new ResidueGraph(structure, strucmotifConfig, depositedAndContacts());

            // extract motifs
            AtomicInteger structureMotifCounter = new AtomicInteger();
            threadPool.submit(() -> {
                residueGraph.residuePairOccurrencesParallel()
                        .forEach(motifOccurrence -> {
                            ResiduePairDescriptor motifDescriptor = motifOccurrence.getResiduePairDescriptor();
                            ResiduePairIdentifier targetIdentifier = motifOccurrence.getResidueIdentifier();

                            Map<Integer, Collection<ResiduePairIdentifier>> groupedTargetIdentifiers = context.buffer.computeIfAbsent(motifDescriptor, k -> Collections.synchronizedMap(new HashMap<>()));
                            Collection<ResiduePairIdentifier> targetIdentifiers = groupedTargetIdentifiers.computeIfAbsent(structureIndex, k -> Collections.synchronizedSet(new HashSet<>()));
                            targetIdentifiers.add(targetIdentifier);
                            structureMotifCounter.incrementAndGet();
                        });
                return null;
            }).get();
            logger.info("[{}] [{}] Extracted {} residue pairs",
                    context.partitionContext,
                    structureContext,
                    structureMotifCounter.get());
        } catch (Exception e) {
            logger.warn("[{}] [{}] Residue graph determination failed",
                    context.partitionContext,
                    structureContext,
                    e);
            // fail complete update
            throw new RuntimeException(e);
        }
    }

    /**
     * Acquire an input stream for the requested item. The simple case is a 4-character PDB-ID; it might also be a URL.
     * @param item request
     * @param context context for logging purposes
     * @throws IOException when reading fails
     * @return an InputStream
     */
    protected InputStream handleInputStream(UpdateItem item, Context context) throws IOException {
        URL url = item.getUrl();
        if (url != null) {
            logger.info("[{}] [{}] Processing {}", context.partitionContext, item.getStructureIdentifier(), url);
            return url.openStream();
        }

        return structureDataProvider.getOriginalInputStream(item.getStructureIdentifier());
    }

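    /**
     * Flushes the buffered residue pairs of this partition to the inverted index, registers all processed
     * structures as known, and clears their dirty flag.
     */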
    private void persist(Context context) throws ExecutionException, InterruptedException {
        logger.info("[{}] Persisting {} unique residue pair descriptors",
                context.partitionContext,
                context.buffer.size());

        final int bufferTotal = context.buffer.size();
        AtomicInteger bufferCount = new AtomicInteger();
        threadPool.submit(() -> {
            context.buffer.entrySet().parallelStream().forEach(entry -> {
                ResiduePairDescriptor key = entry.getKey();
                ResiduePairIdentifierBucket output = new ResiduePairIdentifierBucket(entry.getValue());

                if (bufferCount.incrementAndGet() % 10000 == 0) {
                    logger.info("[{}] {} / {}",
                            context.partitionContext,
                            bufferCount,
                            bufferTotal);
                }

                invertedIndex.insert(key, output);
            });
            return null;
        }).get();

        context.buffer.clear();

        // processed contains all StructureIdentifiers + corresponding revision
        stateRepository.insertKnown(context.processed);
        stateRepository.deleteDirty(context.processed.stream().map(StructureInformation::getStructureIdentifier).collect(Collectors.toSet()));
        context.processed.clear();
    }

    /**
     * 'REMOVE' operation.
     * @param identifiers collection of structure identifiers to remove
     */
    public void remove(Collection<String> identifiers) {
        // mark everything that will be touched as dirty in case this operation fails
        stateRepository.insertDirty(identifiers);

        AtomicInteger counter = new AtomicInteger();
        for (String structureIdentifier : identifiers) {
            logger.info("[{}] Removing renumbered structure for entry: {}",
                    counter.incrementAndGet() + " / " + identifiers.size(),
                    structureIdentifier);
            structureDataProvider.deleteRenumbered(structureIdentifier);
        }

        // inverted index is expensive and should be done as batch
        if (identifiers.size() > 0) {
            Set<Integer> mapped = identifiers.stream()
                    .filter(structureIndexProvider::containsKey)
                    .map(structureIndexProvider::selectStructureIndex)
                    .collect(Collectors.toSet());
            if (mapped.size() > 0) {
                invertedIndex.delete(mapped);
            }
        }

        stateRepository.deleteKnown(identifiers);
        stateRepository.deleteDirty(identifiers);
        logger.info("Finished removal operation");
    }

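    // matches PDB entry IDs such as "4HHB" (a digit followed by three alphanumeric characters)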
    private static final Pattern ENTRY_ID_PATTERN = Pattern.compile("[0-9][0-9A-Z]{3}");

    /**
     * Reports all structures currently present in the PDB archive.
     * @return collection of update items
     * @throws IOException connection failure
     */
    public List<UpdateItem> getAllIdentifiers() throws IOException {
        logger.info("Retrieving current entry list from {}", StrucmotifConfig.RCSB_ENTRY_LIST);
        String response;
        try (InputStream inputStream = new URL(StrucmotifConfig.RCSB_ENTRY_LIST).openStream()) {
            try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream))) {
                response = bufferedReader.lines().collect(Collectors.joining(System.lineSeparator()));
            }
        }
        return ENTRY_ID_PATTERN.matcher(response)
                .results()
                .map(MatchResult::group)
                .map(String::toUpperCase)
                .map(UpdateItem::new)
                .collect(Collectors.toList());
    }

    /**
     * Determine all IDs that need to be added to the archive.
     * @param requested the requested update
     * @return collection of UpdateItems that need to be processed for the given context
     */
    public Collection<UpdateItem> getDeltaPlusIdentifiers(Collection<UpdateItem> requested) {
        Collection<String> known = getKnown();
        if (known.isEmpty()) {
            logger.warn("No existing data - Starting from scratch");
            return requested;
        } else {
            return requested.stream()
                    .filter(item -> !known.contains(item.getStructureIdentifier()))
                    .collect(Collectors.toSet());
        }
    }

    /**
     * Determine all IDs that need to be removed from the archive.
     * @param requested the requested update
     * @return collection of structure identifiers that need to be removed for the given context
     */
    public Collection<String> getDeltaMinusIdentifiers(Collection<UpdateItem> requested) {
        Collection<String> known = getKnown();
        if (known.isEmpty()) {
            logger.warn("No existing data - no need for cleanup of obsolete entries");
            return Collections.emptySet();
        } else {
            Collection<String> unwrapped = requested.stream()
                    .map(UpdateItem::getStructureIdentifier)
                    .collect(Collectors.toSet());
            return known.stream()
                    .filter(unwrapped::contains)
                    .collect(Collectors.toSet());
        }
    }

    private Collection<String> getKnown() {
        return stateRepository.selectKnown()
                .stream()
                .map(StructureInformation::getStructureIdentifier)
                .collect(Collectors.toSet());
    }

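    /**
     * Removes all dirty entries and additionally purges structure indices that linger in the inverted index
     * but are unknown to the state repository (e.g. after an interrupted index write).
     */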
    private void recover(Collection<String> dirty) {
        remove(dirty);

        // this will happen when writing to the inverted index fails: then bins can be corrupted and filled with structure indices that point nowhere
        logger.info("Screening for lingering structures in the index");
        Set<Integer> knownToIndex = invertedIndex.reportKnownKeys();
        Set<Integer> knownToState = stateRepository.reportKnownKeys();
        Set<Integer> lingeringInIndex = knownToIndex.stream()
                .filter(i -> !knownToState.contains(i))
                .collect(Collectors.toSet());
        if (lingeringInIndex.size() > 0) {
            logger.info("{} lingering keys detected - removing...", lingeringInIndex.size());
            invertedIndex.delete(lingeringInIndex);
        }
    }
}