package org.metaeffekt.artifact.resolver.deb.index.pool;
import lombok.Getter;
import lombok.NonNull;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.document.Document;
import org.metaeffekt.artifact.resolver.download.WebAccess;
import org.metaeffekt.artifact.resolver.generic.index.lucene.SimpleLuceneIndex;
import org.metaeffekt.artifact.resolver.generic.utils.GenericUtils;
import org.metaeffekt.artifact.resolver.generic.utils.MarkerUtils;
import org.metaeffekt.artifact.resolver.model.DownloadLocation;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
/**
* An index from filenames to the folders that contain said file.
*
* The Ubuntu archives conveniently publish the output of ls -lR, which we can use to build an index.
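*
* An illustrative (not verbatim) excerpt of such a listing, showing the shape the parser expects:
* <pre>
* ./pool/main/a/accountsservice:
* total 1234
* -rw-r--r-- 1 archive archive 61234 Jan  1 00:00 accountsservice_0.6.55-0ubuntu12_amd64.deb
* -rw-r--r-- 1 archive archive 98765 Jan  1 00:00 accountsservice_0.6.55.orig.tar.xz
* </pre>
* Each "dirname:" header starts a new directory block; the trailing token of every following line is
* taken as a filename in that directory.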
*/
@Slf4j
public class UbuntuPoolIndex {
private final DownloadLocation downloadLocation;
private final WebAccess webAccess;
@Getter
private final String ubuntuPoolDirUrl;
private final String ubuntuPoolDirUrlWithoutLastPathElement;
private final AtomicBoolean initialized = new AtomicBoolean(false);
private SimpleLuceneIndex simpleLuceneIndex = null;
private static final String KEY_DIRNAME = "dirname";
private static final String KEY_FILENAMES = "filenames";
private static final int MAX_HITS = 64;
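// matches a plain file extension (e.g. ".gz") at the end of a url; used to keep the extension on cached downloads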
private static final Pattern fileExtensionPattern = Pattern.compile("\\.[a-zA-Z]{1,16}$");
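// matches directory header lines of the ls -lR output, e.g. "./pool/main/a/abc:" or ".:"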
private static final Pattern newDirectoryListingPattern = Pattern.compile("^(?=(\\./.*:$|\\.:$))",
Pattern.MULTILINE);
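/**
* Creates a lazily initialized pool index.
*
* @param downloadLocation location used to cache the downloaded listing and the derived lucene index
* @param webAccess web access used to download the ls-lR.gz listing
* @param ubuntuPoolDirUrl url of the pool directory to index; the listing is expected one path level above it
*/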
public UbuntuPoolIndex(@NonNull DownloadLocation downloadLocation,
@NonNull WebAccess webAccess,
@NonNull String ubuntuPoolDirUrl) {
this.downloadLocation = downloadLocation;
this.webAccess = webAccess;
this.ubuntuPoolDirUrl = ubuntuPoolDirUrl;
this.ubuntuPoolDirUrlWithoutLastPathElement = removeLastPathElement(ubuntuPoolDirUrl);
}
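/**
* Strips trailing slashes and the last path element from the given url, e.g. ".../ubuntu/pool" becomes ".../ubuntu".
*/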
private static String removeLastPathElement(String urlPath) {
final String noTrailingSlash = urlPath.replaceAll("/*$", "");
int index = noTrailingSlash.lastIndexOf("/");
if (index <= 0) {
log.debug("Can't create ubuntu pool index from url [{}].", urlPath);
// guard against index == -1, which would make the substring below throw
return "";
}
return urlPath.substring(0, index);
}
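/**
* Downloads the ls-lR.gz listing from the parent of {@link #ubuntuPoolDirUrl}, parses it and (re)builds the
* backing lucene index of directory entries (dirname plus the filenames it contains).
*/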
private void init() throws IOException {
final String lsFileUrl = removeLastPathElement(ubuntuPoolDirUrl) + "/" + "ls-lR.gz";
final URL parsedUrl;
try {
parsedUrl = new URL(lsFileUrl);
} catch (MalformedURLException e) {
throw new RuntimeException(e);
}
final File destinationFile = new File(
new File(downloadLocation.deriveDownloadFolder(
"ubuntu-pool-index",
parsedUrl.getHost()),
"[" + parsedUrl.getHost() + "-" + urlToDownloadFilename(lsFileUrl) + "]"),
urlToDownloadFilename(lsFileUrl)
);
final File markerFile = MarkerUtils.deriveMarkerFileFromDestination(destinationFile);
// special logic: force download after a day; don't keep old listings, they are likely out of date!
if (markerFile.exists() && !GenericUtils.isModifiedInLast24Hours(markerFile)) {
// force redownload
if (destinationFile.exists() && !destinationFile.delete()) {
log.warn("Could not delete destination file [{}]; index may not refresh.", destinationFile);
}
if (!markerFile.delete()) {
log.warn("Could not delete marker file [{}]; index may not refresh.", markerFile);
}
}
final File downloaded = MarkerUtils.attemptDownload(markerFile,
destinationFile.getName() + " of class " + this.getClass().getSimpleName() + ": " + ubuntuPoolDirUrl,
() -> GenericUtils.downloadFile(webAccess,
lsFileUrl,
destinationFile,
this.getClass().getSimpleName() + ": " + ubuntuPoolDirUrl));
if (downloaded == null) {
log.debug("Index creation for pool [{}] aborting: listing download failed.", lsFileUrl);
return;
}
final File luceneIndexLocation = new File(downloaded.getParentFile(), downloaded.getName() + "-lucene");
this.simpleLuceneIndex = new SimpleLuceneIndex(luceneIndexLocation);
// clear index beforehand: just rebuild the index every time.
this.simpleLuceneIndex.clear();
Map<String, Collection<String>> entryMap = null;
// parse file to construct index
try (final InputStream inputStream = Files.newInputStream(downloaded.toPath());
final InputStream decompressed = new GZIPInputStream(inputStream, 65536);
final Reader intermediateReader = new InputStreamReader(decompressed, StandardCharsets.UTF_8);
final BufferedReader reader = new BufferedReader(intermediateReader, 65536)) {
String line = null;
while (true) {
// read next line
line = reader.readLine();
if (line == null) {
// end of file
break;
}
// blank lines are meaningless to us
if (StringUtils.isBlank(line)) {
continue;
}
// prepare line for parsing
line = line.trim();
if (newDirectoryListingPattern.matcher(line).find()) {
// this is a new directory part. write the old one and make a new part to be filled
if (entryMap != null) {
// this is not the first directory. write out whatever we have collected so far before starting a new one
simpleLuceneIndex.addEntry(entryMap);
}
entryMap = new HashMap<>();
// cut off the colon at the end
final String lsDirname = line.substring(0, line.length() - 1);
entryMap.put(KEY_DIRNAME, Collections.singletonList(lsDirname));
continue;
}
// otherwise just add the file to the existing map
if (entryMap == null) {
log.debug("Unable to associate line with directory: line [{}].", line);
continue;
}
// derive filename and add it
final String[] lsOutputSplit = StringUtils.split(line);
if (StringUtils.equalsIgnoreCase("total", lsOutputSplit[0]) && lsOutputSplit.length == 2) {
// "total" line of ls -l output (total disk blocks in the directory). useless, so skip it in indexing.
continue;
}
final String filename = lsOutputSplit[lsOutputSplit.length - 1].trim();
if (StringUtils.isBlank(filename)) {
log.trace("Got empty filename trying to add line [{}].", line);
continue;
}
entryMap.computeIfAbsent(KEY_FILENAMES, key -> new LinkedHashSet<>()).add(filename);
}
}
if (entryMap != null) {
simpleLuceneIndex.addEntry(entryMap);
}
simpleLuceneIndex.commit();
}
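/**
* Lazily initializes the index on first call; subsequent calls only verify that the lucene index is available.
*
* @throws IOException if the listing could not be downloaded or the lucene index is unavailable
*/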
public void ensureInitialized() throws IOException {
synchronized (initialized) {
if (!initialized.get()) {
log.debug("Initializing lazy index...");
init();
initialized.set(true);
}
// throw exception if luceneIndex was not set
if (simpleLuceneIndex == null) {
throw new IOException("Index is not properly initialized: lucene index is not available.");
}
}
}
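/**
* Derives a stable cache filename from the url: a sha256 hash of the url, keeping a plain file extension if present.
*/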
private static String urlToDownloadFilename(String url) {
// FIXME: storing under a hash-based filename seems like a hack given our current code infrastructure. is there a better way?
final int lastDot = url.lastIndexOf('.');
final String uncleanEnding = lastDot != -1 ? url.substring(lastDot) : "";
final String clean = fileExtensionPattern.matcher(uncleanEnding).find() ? uncleanEnding : "";
return "sha256-" + DigestUtils.sha256Hex(url.getBytes(StandardCharsets.UTF_8)) + clean;
}
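/**
* Looks up the pool directories that contain a file with the given name.
*
* An illustrative (hypothetical) example: querying "accountsservice_0.6.55.orig.tar.xz" could return
* ".../ubuntu/pool/main/a/accountsservice" for the configured pool url.
*
* @param filename exact filename to look up in the index
* @return absolute directory urls under the pool; empty if the filename is unknown
* @throws IOException if the index could not be initialized
*/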
@NonNull
public List<String> getDirectoriesFromFilename(@NonNull String filename) throws IOException {
ensureInitialized();
final List<Document> foundDocuments = simpleLuceneIndex.lookupContains(KEY_FILENAMES, filename, MAX_HITS);
if (foundDocuments.isEmpty()) {
log.trace("Found no documents for filename [{}] in ubuntu pool index [{}].",
filename,
this.ubuntuPoolDirUrl);
return Collections.emptyList();
}
if (foundDocuments.size() > 1) {
log.debug("Got multiple hits for queried filename [{}].", filename);
}
if (foundDocuments.size() >= MAX_HITS) {
log.debug("Max hits reached while searching ubuntu pool index [{}] for name [{}]. Might lose matches!",
this.ubuntuPoolDirUrl,
filename);
}
return foundDocuments.stream()
.filter(Objects::nonNull)
.map(doc -> doc.get(KEY_DIRNAME))
// only return pool urls from index
.filter(dirInIndex -> dirInIndex.startsWith("./pool/"))
// reconstruct path using partial path as per the position of the "ls-lR.gz" file
.map(dirInIndex -> this.ubuntuPoolDirUrlWithoutLastPathElement + "/" +
dirInIndex.replaceFirst("^\\./", ""))
.collect(Collectors.toList());
}
}