package org.metaeffekt.artifact.resolver.deb.index.pool;
import lombok.Getter;
import lombok.NonNull;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.document.Document;
import org.metaeffekt.artifact.resolver.download.WebAccess;
import org.metaeffekt.artifact.resolver.generic.index.lucene.SimpleLuceneIndex;
import org.metaeffekt.artifact.resolver.generic.utils.GenericUtils;
import org.metaeffekt.artifact.resolver.generic.utils.MarkerUtils;
import org.metaeffekt.artifact.resolver.model.DownloadLocation;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
/**
* An index from filenames to the folders that contain said file.
*
* The Ubuntu archives conveniently publish the output of ls -lR, which we can use to build an index.
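*
* An illustrative (not verbatim) excerpt of such a listing, showing the shape the parser expects:
* <pre>
* ./pool/main/a/accountsservice:
* total 1234
* -rw-r--r-- 1 archive archive 61234 Jan  1 00:00 accountsservice_0.6.55-0ubuntu12_amd64.deb
* -rw-r--r-- 1 archive archive 98765 Jan  1 00:00 accountsservice_0.6.55.orig.tar.xz
* </pre>
* Each "dirname:" header starts a new directory block; the trailing token of every following line is
* taken as a filename in that directory.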
*/
@Slf4j
public class UbuntuPoolIndex {
private final DownloadLocation downloadLocation;
private final WebAccess webAccess;
@Getter
private final String ubuntuPoolDirUrl;
private final String ubuntuPoolDirUrlWithoutLastPathElement;
private final AtomicBoolean initialized = new AtomicBoolean(false);
private SimpleLuceneIndex simpleLuceneIndex = null;
private static final String KEY_DIRNAME = "dirname";
private static final String KEY_FILENAMES = "filenames";
private static final int MAX_HITS = 64;
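// matches a plain file extension (e.g. ".gz") at the end of a url; used to keep the extension on cached downloads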
private static final Pattern fileExtensionPattern = Pattern.compile("\\.[a-zA-Z]{1,16}$");
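// matches directory header lines of the ls -lR output, e.g. "./pool/main/a/abc:" or ".:"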
private static final Pattern newDirectoryListingPattern = Pattern.compile("^(?=(\\./.*:$|\\.:$))",
Pattern.MULTILINE);
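/**
* Creates a lazily initialized pool index.
*
* @param downloadLocation location used to cache the downloaded listing and the derived lucene index
* @param webAccess web access used to download the ls-lR.gz listing
* @param ubuntuPoolDirUrl url of the pool directory to index; the listing is expected one path level above it
*/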
public UbuntuPoolIndex(@NonNull DownloadLocation downloadLocation,
@NonNull WebAccess webAccess,
@NonNull String ubuntuPoolDirUrl) {
this.downloadLocation = downloadLocation;
this.webAccess = webAccess;
this.ubuntuPoolDirUrl = ubuntuPoolDirUrl;
this.ubuntuPoolDirUrlWithoutLastPathElement = removeLastPathElement(ubuntuPoolDirUrl);
}
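/**
* Strips trailing slashes and the last path element from the given url, e.g. ".../ubuntu/pool" becomes ".../ubuntu".
*/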
private static String removeLastPathElement(String urlPath) {
final String noTrailingSlash = urlPath.replaceAll("/*$", "");
int index = noTrailingSlash.lastIndexOf("/");
if (index <= 0) {
log.debug("Can't create ubuntu pool index from url [{}].", urlPath);
// guard against index == -1, which would make the substring below throw
return "";
}
return urlPath.substring(0, index);
}
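/**
* Downloads the ls-lR.gz listing from the parent of {@link #ubuntuPoolDirUrl}, parses it and (re)builds the
* backing lucene index of directory entries (dirname plus the filenames it contains).
*/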
private void init() throws IOException {
final String lsFileUrl = removeLastPathElement(ubuntuPoolDirUrl) + "/" + "ls-lR.gz";
final URL parsedUrl;
try {
parsedUrl = new URL(lsFileUrl);
} catch (MalformedURLException e) {
throw new RuntimeException(e);
}
final File destinationFile = new File(
new File(downloadLocation.deriveDownloadFolder(
"ubuntu-pool-index",
parsedUrl.getHost()),
"[" + parsedUrl.getHost() + "-" + urlToDownloadFilename(lsFileUrl) + "]"),
urlToDownloadFilename(lsFileUrl)
);
final File markerFile = MarkerUtils.deriveMarkerFileFromDestination(destinationFile);
// special logic: force download after a day; don't keep old listings, they are likely out of date!
if (markerFile.exists() && !GenericUtils.isModifiedInLast24Hours(markerFile)) {
// force redownload
if (destinationFile.exists() && !destinationFile.delete()) {
log.warn("Could not delete destination file [{}]; index may not refresh.", destinationFile);
}
if (!markerFile.delete()) {
log.warn("Could not delete marker file [{}]; index may not refresh.", markerFile);
}
}
final File downloaded = MarkerUtils.attemptDownload(markerFile,
destinationFile.getName() + " of class " + this.getClass().getSimpleName() + ": " + ubuntuPoolDirUrl,
() -> GenericUtils.downloadFile(webAccess,
lsFileUrl,
destinationFile,
this.getClass().getSimpleName() + ": " + ubuntuPoolDirUrl));
if (downloaded == null) {
log.debug("Index creation for pool [{}] aborting: listing download failed.", lsFileUrl);
return;
}
final File luceneIndexLocation = new File(downloaded.getParentFile(), downloaded.getName() + "-lucene");
this.simpleLuceneIndex = new SimpleLuceneIndex(luceneIndexLocation);
// clear index beforehand: just rebuild the index every time.
this.simpleLuceneIndex.clear();
Map<String, Collection<String>> entryMap = null;
// parse file to construct index
try (final InputStream inputStream = Files.newInputStream(downloaded.toPath());
final InputStream decompressed = new GZIPInputStream(inputStream, 65536);
final Reader intermediateReader = new InputStreamReader(decompressed, StandardCharsets.UTF_8);
final BufferedReader reader = new BufferedReader(intermediateReader, 65536)) {
String line = null;
while (true) {
// read next line
line = reader.readLine();
if (line == null) {
// end of file
break;
}
// blank lines are meaningless to us
if (StringUtils.isBlank(line)) {
continue;
}
// prepare line for parsing
line = line.trim();
if (newDirectoryListingPattern.matcher(line).find()) {
// this is a new directory part. write the old one and make a new part to be filled
if (entryMap != null) {
// this is not the first directory. write out whatever we have collected so far before starting a new one
simpleLuceneIndex.addEntry(entryMap);
}
entryMap = new HashMap<>();
// cut off the colon at the end
final String lsDirname = line.substring(0, line.length() - 1);
entryMap.put(KEY_DIRNAME, Collections.singletonList(lsDirname));
continue;
}
// otherwise just add the file to the existing map
if (entryMap == null) {
log.debug("Unable to associate line with directory: line [{}].", line);
continue;
}
// derive filename and add it
final String[] lsOutputSplit = StringUtils.split(line);
if (StringUtils.equalsIgnoreCase("total", lsOutputSplit[0]) && lsOutputSplit.length == 2) {
// "total" line of ls -l output (total disk blocks in the directory). useless, so skip it in indexing.
continue;
}
final String filename = lsOutputSplit[lsOutputSplit.length - 1].trim();
if (StringUtils.isBlank(filename)) {
log.trace("Got empty filename trying to add line [{}].", line);
continue;
}
entryMap.computeIfAbsent(KEY_FILENAMES, key -> new LinkedHashSet<>()).add(filename);
}
}
if (entryMap != null) {
simpleLuceneIndex.addEntry(entryMap);
}
simpleLuceneIndex.commit();
}
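/**
* Lazily initializes the index on first call; subsequent calls only verify that the lucene index is available.
*
* @throws IOException if the listing could not be downloaded or the lucene index is unavailable
*/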
public void ensureInitialized() throws IOException {
synchronized (initialized) {
if (!initialized.get()) {
log.debug("Initializing lazy index...");
init();
initialized.set(true);
}
// throw exception if luceneIndex was not set
if (simpleLuceneIndex == null) {
throw new IOException("Index is not properly initialized: lucene index is not available.");
}
}
}
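/**
* Derives a stable cache filename from the url: a sha256 hash of the url, keeping a plain file extension if present.
*/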
private static String urlToDownloadFilename(String url) {
// FIXME: storing under a hash-based filename seems like a hack given our current code infrastructure. is there a better way?
final int lastDot = url.lastIndexOf('.');
final String uncleanEnding = lastDot != -1 ? url.substring(lastDot) : "";
final String clean = fileExtensionPattern.matcher(uncleanEnding).find() ? uncleanEnding : "";
return "sha256-" + DigestUtils.sha256Hex(url.getBytes(StandardCharsets.UTF_8)) + clean;
}
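/**
* Looks up the pool directories that contain a file with the given name.
*
* An illustrative (hypothetical) example: querying "accountsservice_0.6.55.orig.tar.xz" could return
* ".../ubuntu/pool/main/a/accountsservice" for the configured pool url.
*
* @param filename exact filename to look up in the index
* @return absolute directory urls under the pool; empty if the filename is unknown
* @throws IOException if the index could not be initialized
*/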
@NonNull
public List<String> getDirectoriesFromFilename(@NonNull String filename) throws IOException {
ensureInitialized();
final List<Document> foundDocuments = simpleLuceneIndex.lookupContains(KEY_FILENAMES, filename, MAX_HITS);
if (foundDocuments.isEmpty()) {
log.trace("Found no documents for filename [{}] in ubuntu pool index [{}].",
filename,
this.ubuntuPoolDirUrl);
return Collections.emptyList();
}
if (foundDocuments.size() > 1) {
log.debug("Got multiple hits for queried filename [{}].", filename);
}
if (foundDocuments.size() >= MAX_HITS) {
log.debug("Max hits reached while searching ubuntu pool index [{}] for name [{}]. Might lose matches!",
this.ubuntuPoolDirUrl,
filename);
}
return foundDocuments.stream()
.filter(Objects::nonNull)
.map(doc -> doc.get(KEY_DIRNAME))
// only return pool urls from index
.filter(dirInIndex -> dirInIndex.startsWith("./pool/"))
// reconstruct path using partial path as per the position of the "ls-lR.gz" file
.map(dirInIndex -> this.ubuntuPoolDirUrlWithoutLastPathElement + "/" +
dirInIndex.replaceFirst("^\\./", ""))
.collect(Collectors.toList());
}
}