All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.metaeffekt.artifact.resolver.deb.index.pool.UbuntuPoolIndex Maven / Gradle / Ivy

package org.metaeffekt.artifact.resolver.deb.index.pool;

import lombok.Getter;
import lombok.NonNull;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.document.Document;
import org.metaeffekt.artifact.resolver.download.WebAccess;
import org.metaeffekt.artifact.resolver.generic.index.lucene.SimpleLuceneIndex;
import org.metaeffekt.artifact.resolver.generic.utils.GenericUtils;
import org.metaeffekt.artifact.resolver.generic.utils.MarkerUtils;
import org.metaeffekt.artifact.resolver.model.DownloadLocation;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;

/**
 * An index from filenames to folders that contains said file.
 * 
* The ubuntu archives do this cool thing where they give us output from ls -lR, which we can use to build an index. */ @Slf4j public class UbuntuPoolIndex { private final DownloadLocation downloadLocation; private final WebAccess webAccess; @Getter private final String ubuntuPoolDirUrl; private final String ubuntuPoolDirUrlWithoutLastPathElement; private final AtomicBoolean initialized = new AtomicBoolean(false); private SimpleLuceneIndex simpleLuceneIndex = null; private static final String KEY_DIRNAME = "dirname"; private static final String KEY_FILENAMES = "filenames"; private static final int MAX_HITS = 64; private static final Pattern fileExtensionPattern = Pattern.compile("\\.[a-zA-Z]{1,16}$"); private static final Pattern newDirectoryListingPattern = Pattern.compile("^(?=(\\./.*:$|\\.:$))", Pattern.MULTILINE); public UbuntuPoolIndex(@NonNull DownloadLocation downloadLocation, @NonNull WebAccess webAccess, @NonNull String ubuntuPoolDirUrl) { this.downloadLocation = downloadLocation; this.webAccess = webAccess; this.ubuntuPoolDirUrl = ubuntuPoolDirUrl; this.ubuntuPoolDirUrlWithoutLastPathElement = removeLastPathElement(ubuntuPoolDirUrl); } private static String removeLastPathElement(String urlPath) { final String noTrailingSlash = urlPath.replaceAll("/*$", ""); int index = noTrailingSlash.lastIndexOf("/"); if (index <= 0) { log.debug("Can't create ubuntu pool index from url [{}].", urlPath); } return urlPath.substring(0, index); } private void init() throws IOException { final String lsFileUrl = removeLastPathElement(ubuntuPoolDirUrl) + "/" + "ls-lR.gz"; final URL parsedUrl; try { parsedUrl = new URL(lsFileUrl); } catch (MalformedURLException e) { throw new RuntimeException(e); } final File destinationFile = new File( new File(downloadLocation.deriveDownloadFolder( "ubuntu-pool-index", parsedUrl.getHost()), "[" + parsedUrl.getHost() + "-" + urlToDownloadFilename(lsFileUrl) + "]"), urlToDownloadFilename(lsFileUrl) ); final File markerFile = MarkerUtils.deriveMarkerFileFromDestination(destinationFile); // special logic: force download after a day; don't keep old listings, they are likely out of date! if (markerFile.exists() && !GenericUtils.isModifiedInLast24Hours(markerFile)) { // force redownload if (destinationFile.exists() && !destinationFile.delete()) { log.warn("Could not delete destination file [{}]; index may not refresh.", destinationFile); } if (!markerFile.delete()) { log.warn("Could not delete marker file [{}]; index may not refresh.", markerFile); } } final File downloaded = MarkerUtils.attemptDownload(markerFile, destinationFile.getName()+ " of class " + this.getClass().getSimpleName() + ": " + ubuntuPoolDirUrl, () -> GenericUtils.downloadFile(webAccess, lsFileUrl, destinationFile, this.getClass().getSimpleName() + ": " + ubuntuPoolDirUrl)); if (downloaded == null) { log.debug("Index creation for pool [{}] aborting: listing download failed.", lsFileUrl); return; } final File luceneIndexLocation = new File(downloaded.getParentFile(), downloaded.getName() + "-lucene"); this.simpleLuceneIndex = new SimpleLuceneIndex(luceneIndexLocation); // clear index beforehand: just rebuild the index every time. this.simpleLuceneIndex.clear(); Map> entryMap = null; // parse file to construct index try (final InputStream inputStream = Files.newInputStream(downloaded.toPath()); final InputStream decompressed = new GZIPInputStream(inputStream, 65536); final Reader intermediateReader = new InputStreamReader(decompressed, StandardCharsets.UTF_8); final BufferedReader reader = new BufferedReader(intermediateReader, 65536)) { String line = null; while (true) { // read next line line = reader.readLine(); if (line == null) { // end of file break; } // blank lines are meaningless to us if (StringUtils.isBlank(line)) { continue; } // prepare line for parsing line = line.trim(); if (newDirectoryListingPattern.matcher(line).find()) { // this is a new directory part. write the old one and make a new part to be filled if (entryMap != null) { // this is not the first directory. write whatever we have collected so far and write it simpleLuceneIndex.addEntry(entryMap); } entryMap = new HashMap<>(); // cur off the colon at the end final String lsDirname = line.substring(0, line.length() - 1); entryMap.put(KEY_DIRNAME, Collections.singletonList(lsDirname)); continue; } // otherwise just add the file to the existing map if (entryMap == null) { log.debug("Unable to associate line with directory: line [{}].", line); continue; } // derive filename and add it final String[] lsOutputSplit = StringUtils.split(line); if (StringUtils.equalsIgnoreCase("total", lsOutputSplit[0]) && lsOutputSplit.length == 2) { // signifies the total number of files in the directory. useless, so skip it in indexing. continue; } final String filename = lsOutputSplit[lsOutputSplit.length - 1].trim(); if (StringUtils.isBlank(filename)) { log.trace("Got empty filename trying to add line [{}].", line); continue; } entryMap.computeIfAbsent(KEY_FILENAMES, key -> new LinkedHashSet<>()).add(filename); } } if (entryMap != null) { simpleLuceneIndex.addEntry(entryMap); } simpleLuceneIndex.commit(); } public void ensureInitialized() throws IOException { synchronized (initialized) { if (!initialized.get()) { log.debug("Initializing lazy index..."); init(); initialized.set(true); } // throw exception if luceneIndex was not set if (simpleLuceneIndex == null) { throw new IOException("Index is not properly initialized: lucene index is not available."); } } } private static String urlToDownloadFilename(String url) { // FIXME: storing hash-based seems like a hack given our current code infrastructure. is there a better way? final int lastDot = url.lastIndexOf('.'); final String uncleanEnding = lastDot != -1 ? url.substring(lastDot) : ""; final String clean = fileExtensionPattern.matcher(uncleanEnding).find() ? uncleanEnding : ""; return "sha256-" + DigestUtils.sha256Hex(url.getBytes(StandardCharsets.UTF_8)) + clean; } @NonNull public List getDirectoriesFromFilename(@NonNull String filename) throws IOException { ensureInitialized(); final List foundDocuments = simpleLuceneIndex.lookupContains(KEY_FILENAMES, filename, MAX_HITS); if (foundDocuments.isEmpty()) { log.trace("Found no documents for filename [{}] in ubuntu pool index [{}].", filename, this.ubuntuPoolDirUrl); return Collections.emptyList(); } if (foundDocuments.size() > 1) { log.debug("Got multiple hits for queried filename [{}].", filename); } if (foundDocuments.size() >= MAX_HITS) { log.debug("Max hits reached while searching ubuntu pool index [{}] for name [{}]. Might lose matches!", this.ubuntuPoolDirUrl, filename); } return foundDocuments.stream() .filter(Objects::nonNull) .map(doc -> doc.get(KEY_DIRNAME)) // only return pool urls from index .filter(dirInIndex -> dirInIndex.startsWith("./pool/")) // reconstruct path using partial path as per the position of the "ls-lR.gz" file .map(dirInIndex -> this.ubuntuPoolDirUrlWithoutLastPathElement + "/" + dirInIndex.replaceFirst("^\\./", "")) .collect(Collectors.toList()); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy