// org.metaeffekt.artifact.resolver.maven.index.MavenCentralIndex (Maven / Gradle / Ivy)
/*
* Copyright 2021-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.metaeffekt.artifact.resolver.maven.index;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectReader;
import com.fasterxml.jackson.databind.type.TypeFactory;
import lombok.NonNull;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.metaeffekt.artifact.resolver.download.WebAccess;
import org.metaeffekt.artifact.resolver.generic.utils.MarkerQueryResult;
import org.metaeffekt.artifact.resolver.generic.utils.MarkerUtils;
import org.metaeffekt.artifact.resolver.maven.index.lucene.MavenLuceneIndex;
import org.metaeffekt.artifact.resolver.maven.index.lucene.MavenLuceneIndexer;
import org.metaeffekt.artifact.resolver.model.DownloadLocation;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.zip.GZIPInputStream;
/**
* Index of maven central.
*
* Constructed from a preprocessed dump of their custom data format.
*
* Only works for Maven Central, reconstructing the index if the input file's digest changed.
*/
@Slf4j
public class MavenCentralIndex implements AutoCloseable {
// Size in bytes (1 MiB) of the read buffer used while hashing a dump.
private static final int HASH_BUFFER_SIZE = 1048576;
// Name of the index field under which the hash of the source dump is stored;
// written by createIndex(...) and looked up by isLuceneIndexCorrelated(...).
private static final String DUMP_HASH_KEY = "DUMP-HASH";
// Set to true once init() has completed; the object is also used as the monitor
// in ensureInitialized() and close().
private final AtomicBoolean initialized = new AtomicBoolean(false);
// Used to fetch the remote .sha512 file and the dump itself.
private final WebAccess webAccess;
// Supplies the dump location (local path and/or download url).
private final MavenCentralIndexConfig config;
// Despite its name this is the folder derived for dumps; the downloaded dump is placed inside it.
private final File dumpFile;
// Folder that holds the lucene index.
private final File luceneIndexDir;
// Remains null until init() has opened the index.
private MavenLuceneIndex mavenLuceneIndex = null;
public MavenCentralIndex(DownloadLocation downloadLocation, WebAccess webAccess, MavenCentralIndexConfig config) {
this.webAccess = webAccess;
this.config = config;
this.dumpFile = downloadLocation.deriveDownloadFolder("maven-repo-index", "dumps");
this.luceneIndexDir = downloadLocation.deriveDownloadFolder("maven-repo-index", "index");
}
/**
 * Computes the SHA-512 digest of everything that can still be read from the given stream.
 * <p>
 * The stream is consumed until end-of-stream but is not closed by this method.
 *
 * @param inputStream the stream to hash
 * @return the hex-encoded SHA-512 digest
 * @throws IOException if reading from the stream fails
 */
private static String getSha512Hex(InputStream inputStream) throws IOException {
    final long hashStart = System.currentTimeMillis();

    final MessageDigest digest;
    try {
        digest = MessageDigest.getInstance("SHA-512");
    } catch (NoSuchAlgorithmException e) {
        // SHA-512 is expected to be available; treat its absence as a programming error
        throw new RuntimeException("Should never happen: Could not find valid algorithm.", e);
    }

    long contentSize = 0;
    final byte[] buffer = new byte[HASH_BUFFER_SIZE];
    int read;
    while ((read = inputStream.read(buffer, 0, buffer.length)) != -1) {
        digest.update(buffer, 0, read);
        contentSize += read;
    }

    final String contentHash = Hex.encodeHexString(digest.digest());

    // statistics are only formatted when they are actually going to be logged
    if (log.isDebugEnabled()) {
        final long durationMs = System.currentTimeMillis() - hashStart;
        // divide as double: an integer division would silently drop the fractional MiB
        final double contentSizeMiB = contentSize / 1048576.0;
        // clamp to 1ms so that very fast runs do not produce Infinity / NaN throughput
        final double durationS = Math.max(durationMs, 1L) / 1000.0;
        final double megabytesPerSecond = contentSizeMiB / durationS;

        log.debug("Hashed input [{}]MiB in [{}]ms ([{}]MiB/s).", String.format("%.2f", contentSizeMiB),
                durationMs, String.format("%.2f", megabytesPerSecond));
    }

    return contentHash;
}
/**
 * Computes the SHA-512 digest of a file's content.
 *
 * @param toHash the file to hash
 * @return the hex-encoded SHA-512 digest
 * @throws IOException if the file cannot be opened or read
 */
private static String getSha512Hex(File toHash) throws IOException {
    try (final InputStream fileContent = Files.newInputStream(toHash.toPath())) {
        final String hexDigest = getSha512Hex(fileContent);
        return hexDigest;
    }
}
/**
 * Clears the given indexer and refills it from a gzip-compressed, newline-delimited json dump.
 * <p>
 * Every line of the dump is parsed into a map and handed to the indexer; the indexer is committed
 * once all lines were processed.
 * <p>
 * NOTE(review): entries are added from a parallel stream, which assumes that
 * {@code MavenLuceneIndexer.addEntry} may be called concurrently; confirm.
 *
 * @param mavenLuceneIndexer the indexer to clear and fill
 * @param jsonDumpGzip       the gzip-compressed ndjson dump to read
 * @return the number of lines (entries) that were added
 * @throws IOException if the dump cannot be read or a line cannot be parsed / added
 */
private static long fillIndex(MavenLuceneIndexer mavenLuceneIndexer, File jsonDumpGzip) throws IOException {
    final AtomicLong total = new AtomicLong(0);

    log.trace("Clearing index...");
    // delete all previous entries to make for a fresh index
    mavenLuceneIndexer.clear();

    log.trace("Filling index...");
    try (final GZIPInputStream inputStream =
                 new GZIPInputStream(Files.newInputStream(jsonDumpGzip.toPath()), 1048576);
         final InputStreamReader streamReader = new InputStreamReader(inputStream, StandardCharsets.UTF_8);
         final BufferedReader bufferedReader = new BufferedReader(streamReader, 1048576)) {

        final ObjectReader objectReader = new ObjectMapper().readerFor(
                TypeFactory.defaultInstance().constructMapType(HashMap.class, String.class, HashSet.class));

        log.info("Reading maven central index with stream of lines and parallel indexing...");

        // stream the lines and run indexing in parallel, bounded by reader speed
        bufferedReader.lines().parallel().forEach(line -> {
            // parse each line and add it as an entry
            try {
                mavenLuceneIndexer.addEntry(objectReader.readValue(line));
            } catch (IOException e) {
                // tunnel the checked exception through the lambda; unwrapped again below
                throw new UncheckedIOException(e);
            }

            // increment total and capture it to print progress
            final long totalSnapshot = total.incrementAndGet();
            if (totalSnapshot % 10000000 == 0) {
                log.debug("Imported so far: [{}]", totalSnapshot);
            }
        });

        log.debug("Committing filled index...");
        // ensure we wrote everything cleanly before proceeding
        mavenLuceneIndexer.commit();
    } catch (UncheckedIOException e) {
        // covers both the tunnelled parse failures and read failures raised by BufferedReader.lines();
        // surface them as the IOException this method declares
        throw e.getCause();
    }

    log.trace("Created [{}] documents.", total.get());
    return total.get();
}
/**
 * Deletes the given file and logs a warning if the deletion did not succeed.
 *
 * @param target the file to delete
 */
private static void deleteFile(File target) {
    final boolean deleted = target.delete();
    if (deleted) {
        return;
    }
    log.warn("Deletion failed unexpectedly: [{}]. Permission issue?", target.toPath());
}
/**
 * Checks whether the lucene index on disk was built from the dump with the given hash.
 * <p>
 * The index is opened temporarily and queried for the dump hash stored under {@link #DUMP_HASH_KEY}.
 * Any failure while opening or querying the index is treated as "not correlated".
 *
 * @param localIndexDumpHash hex-encoded hash of the local dump; may be null if no dump is available
 * @return true if the index contains the given hash; false otherwise or on any error
 */
private boolean isLuceneIndexCorrelated(String localIndexDumpHash) {
    if (localIndexDumpHash == null) {
        // without a dump hash there is nothing the index could be correlated with
        log.debug("No local dump hash available; treating lucene index as not correlated.");
        return false;
    }

    try (final MavenLuceneIndex index = new MavenLuceneIndex(luceneIndexDir, new WhitespaceAnalyzer())) {
        // if the index contained this hash as a dump hash key, we built it over the same dump
        return !index.lookupContains(DUMP_HASH_KEY, localIndexDumpHash, 64).isEmpty();
    } catch (Exception e) {
        // expected e.g. when no index exists yet; report the cause, keep the stack trace at debug level
        log.warn("Irregularity while checking correlation of local dump and local lucene index: [{}]",
                e.toString());
        log.debug("Details of failed correlation check.", e);
        return false;
    }
}
/**
 * Initializes the index: obtains the dump (reusing or downloading it), rebuilds the lucene index if
 * required and opens it for querying.
 *
 * @throws IOException if hashing the dump, building the index or opening the index fails
 */
private synchronized void init() throws IOException {
    log.info("Initializing Maven Central Index...");

    final File ndjsonDumpGzip = deriveLocalIndexDumpFile();

    // calculate hash for comparison against remote and for checking that the lucene index is related
    String localIndexDumpHash = getLocalIndexDumpHash(ndjsonDumpGzip);

    boolean requiresIndexUpdate =
            downloadOrReuseIndexDump(ndjsonDumpGzip, luceneIndexDir, localIndexDumpHash);

    if (requiresIndexUpdate) {
        // The dump may just have been replaced by a fresh download. The hash computed above would then
        // describe the previous (or a missing) file and the index would be stamped with a wrong hash.
        localIndexDumpHash = getLocalIndexDumpHash(ndjsonDumpGzip);
    }

    requiresIndexUpdate |= !isLuceneIndexCorrelated(localIndexDumpHash);

    if (requiresIndexUpdate) {
        createIndex(ndjsonDumpGzip, localIndexDumpHash);
    }

    // NOTE: nexus has a custom analyzer; we currently use the lucene defaults
    final MavenLuceneIndex index = new MavenLuceneIndex(luceneIndexDir, new WhitespaceAnalyzer());
    try {
        // dummy lookup since we want to guarantee loading at this very point
        index.lookupContains("a", "b", 64);

        log.info("Initializing Maven Central Index completed. Operating on [{}] entries.", index.size());
    } catch (Exception e) {
        // do not leak the opened index if it turns out to be unusable
        try {
            index.close();
        } catch (Exception closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }

    // publish only once the index proved to be usable
    mavenLuceneIndex = index;
}
/**
 * Builds the lucene index from the given dump unless a marker indicates that a previously built
 * index can be reused.
 * <p>
 * After filling, an additional entry carrying the hash of the dump is added so that the index can
 * later be matched against the dump it was built from. A marker file is written on success.
 * <p>
 * NOTE(review): the hash entry is added after the commit performed while filling; presumably it is
 * persisted when the indexer is closed; confirm.
 *
 * @param ndjsonDumpGzip   the gzip-compressed ndjson dump to index
 * @param indexContentHash hex-encoded hash of the dump, stored in the index
 * @throws IOException if building the index fails
 */
private void createIndex(File ndjsonDumpGzip, final String indexContentHash) throws IOException {
    final String debugRef = this.getClass().getSimpleName();

    // markers are derived from a dummy file placed inside the index directory
    final File luceneDirDummyFile = new File(luceneIndexDir, ".DUMMY");
    final File markerFile = MarkerUtils.deriveMarkerFileFromDestination(luceneDirDummyFile);
    final MarkerQueryResult markerQueryResult = MarkerUtils.queryMarker(markerFile, debugRef);

    if (markerQueryResult.getFoundTarget() != null) {
        log.debug("Reusing previously initialized index.");
        return;
    }

    log.debug("Rebuilding index from json dump...");

    // marker handling (preliminary deletion of a now-outdated marker file)
    final boolean staleMarkerPresent = markerFile.exists();
    if (staleMarkerPresent && !markerFile.delete()) {
        log.warn("Failed to delete marker [{}] while building new index. Might lead to errors.",
                markerFile.toPath());
    }

    final long entryCount;
    try (MavenLuceneIndexer indexer = new MavenLuceneIndexer(luceneIndexDir, new WhitespaceAnalyzer())) {
        entryCount = fillIndex(indexer, ndjsonDumpGzip);

        // adding extra field of hash of input dump to check validity / readability of the index later
        indexer.addEntry(Collections.singletonMap(DUMP_HASH_KEY, Collections.singleton(indexContentHash)));

        // marker handling (success)
        try {
            if (!luceneDirDummyFile.exists() && !luceneDirDummyFile.createNewFile()) {
                log.warn("Could not create dummy file for marker handling. Issue with lucene destination dir?");
            }
            final byte[] dummyContent = "- dummy content -".getBytes(StandardCharsets.UTF_8);
            Files.write(luceneDirDummyFile.toPath(), dummyContent);
            MarkerUtils.markSuccess(luceneDirDummyFile, markerFile, debugRef);
        } catch (IOException e) {
            // discard exception in the name of resilience, however future runs may be impacted
            log.warn("Could not create marker [{}] for built index [{}].", markerFile, debugRef, e);
        }
    } catch (Exception e) {
        throw new IOException(e.getMessage(), e);
    }

    log.info("Maven central index constructed with [{}] entries.", entryCount);
}
/**
 * Computes the sha512 of a locally available index dump.
 *
 * @param localIndexDumpFile the locally downloaded dump file
 * @return the hex-encoded sha512 of the file, or null if the file does not exist
 * @throws IOException if the existing file cannot be read
 */
private String getLocalIndexDumpHash(File localIndexDumpFile) throws IOException{
    log.debug("Hashing input json dump...");

    // nothing to hash if there is no dump yet
    if (!localIndexDumpFile.exists()) {
        return null;
    }

    try {
        return getSha512Hex(localIndexDumpFile).trim();
    } catch (IOException e) {
        log.error("Error while trying to get hash of existing dump at [{}].", localIndexDumpFile.toPath());
        throw e;
    }
}
/**
 * Downloads the remote index dump if the local one is missing or outdated and verifies its checksum.
 * <p>
 * The remote checksum is read from the dump url with the suffix <code>.sha512</code> and compared
 * against the given local hash. If they differ (or no local dump exists), the local dump and its
 * marker are deleted, the dump is downloaded and the download is re-hashed and checked against
 * the remote checksum.
 * <p>
 * Note that <code>localIndexDumpHash</code> is reassigned inside this method after a download; that
 * new value is not visible to the caller.
 *
 * @param localIndexDumpFile The existing local dump file.
 * @param localLuceneIndexDir The existing local index dir.
 * @param localIndexDumpHash The existing local hash of the dump file; may be null.
 *
 * @return Returns <code>true</code> in case the index must be redone (a new dump was downloaded
 * and verified, the local hash is unknown, or the index dir does not exist); <code>false</code>
 * in case not. The method also returns <code>false</code> in case of an error obtaining or
 * validating the local or remote index data.
 */
private boolean downloadOrReuseIndexDump(File localIndexDumpFile, File localLuceneIndexDir, String localIndexDumpHash) {
// check whether local file exists and is a regular file
if (localIndexDumpFile.exists() && !Files.isRegularFile(localIndexDumpFile.toPath())) {
log.warn("Local index dump isn't a regular file: [{}]", localIndexDumpFile.toPath());
log.info("Reusing possibly outdated local data with sha512 hash [{}].", localIndexDumpHash);
return false;
}
// download precomputed hash of remote file from web server
final String remoteHash;
try {
try (WebAccess.WebSession session = webAccess.createSession()) {
final String sha512Uri = config.getNdjsonDumpGzipUrl() + ".sha512";
// remoteHash becomes null if the download yields no content
remoteHash = session.downloadToUtf8String(sha512Uri, (response) -> log.error(
"Could not get .sha512 file from intended download server [{}]. HTTP status: [{}]", sha512Uri,
response.getStatusLine().getStatusCode())).map(String::trim).orElse(null);
}
} catch (IOException e) {
log.warn("Could not obtain hash of remote index dump.");
log.info("Reusing possibly outdated local data with sha512 hash [{}].", localIndexDumpHash);
return false;
}
// without a remote hash an existing local dump cannot be checked for freshness; keep it
if (remoteHash == null && localIndexDumpFile.exists()) {
log.warn("Could not update index. Remote server was not available. Using possibly outdated data!");
log.info("Reusing possibly outdated local data with sha512 hash [{}].", localIndexDumpHash);
return false;
}
if (!localIndexDumpFile.exists() || !StringUtils.equals(localIndexDumpHash, remoteHash)) {
// file changed; force update.
log.info("Local index dump at [{}] {}. Downloading to [{}].", localIndexDumpFile,
(localIndexDumpFile.exists() ? "outdated" : "missing"), localIndexDumpFile.toPath());
final File ndjsonDumpGzipMarker = MarkerUtils.deriveMarkerFileFromDestination(localIndexDumpFile);
// NOTE(review): the previous dump is removed before the download is attempted; if the download
// fails below, the "reusing local data" messages refer to a file that was already deleted.
if (localIndexDumpFile.exists()) deleteFile(localIndexDumpFile);
if (ndjsonDumpGzipMarker.exists()) deleteFile(ndjsonDumpGzipMarker);
// download dump
final String debugRef = "maven central index: download to [" + localIndexDumpFile.toPath() + "]";
final File downloaded = MarkerUtils.attemptDownload(this.webAccess, config.getNdjsonDumpGzipUrl(), localIndexDumpFile, debugRef);
// a null result is treated as a failed download
if (downloaded == null) {
log.warn("Could not download source nexus index dump.");
log.info("Reusing possibly outdated local data with sha512 hash [{}].", localIndexDumpHash);
return false;
}
try {
log.debug("Rehashing downloaded index dump to verify download integrity.");
// only updates the local parameter copy; the caller keeps its own value
localIndexDumpHash = getSha512Hex(localIndexDumpFile);
} catch (IOException e) {
log.warn("Could not compute hash of downloaded dump at [{}].", localIndexDumpFile.toPath());
log.info("Reusing possibly outdated local data with sha512 hash [{}].", localIndexDumpHash);
return false;
}
if (!StringUtils.equals(localIndexDumpHash, remoteHash)) {
log.error("Download to [{}] failed to pass checksum check.", localIndexDumpFile.toPath());
MarkerUtils.invalidateMarkerFor(downloaded, debugRef);
log.info("Reusing possibly outdated local data with sha512 hash [{}].", localIndexDumpHash);
return false;
}
// in case the input dataset has changed, an update is required
return true;
} else {
log.debug("Using existing dump from [{}].", localIndexDumpFile.toPath());
}
if (localIndexDumpHash == null) {
log.warn("Hash of index content null; must be previous failure while downloading index dump.");
return true;
}
// nothing changed; nothing problematic. An update is only needed if there is no index dir yet.
return !localLuceneIndexDir.exists();
}
/**
 * Determines where the index dump is (or will be) located on disk.
 * <p>
 * An explicitly configured path wins; otherwise, if a download url is configured, a fixed file
 * name inside the dump folder is used.
 *
 * @return the local dump file; never null
 * @throws IllegalArgumentException if neither a dump path nor a dump url is configured
 */
@NonNull
private File deriveLocalIndexDumpFile() {
    final String configuredPath = config.getNdjsonDumpGzipPath();
    if (configuredPath != null) {
        return new File(configuredPath);
    }

    if (config.getNdjsonDumpGzipUrl() == null) {
        log.debug("Aborting initialization: insufficient configuration.");
        throw new IllegalArgumentException("Could not derive ndjsonDumpGzip, as paths were missing in config.");
    }

    return new File(dumpFile, "downloaded-index-dump.ndjson.gz");
}
/**
 * Initializes the index on first use and verifies that the lucene index is available.
 * <p>
 * Initialization is attempted again on the next call if a previous attempt failed with an
 * exception, since the initialized flag is only set after {@code init()} returned.
 *
 * @throws IOException if initialization fails or the lucene index is not available
 */
public void ensureInitialized() throws IOException {
    synchronized (initialized) {
        final boolean alreadyInitialized = initialized.get();
        if (!alreadyInitialized) {
            log.debug("Attempting index initialization...");
            init();
            initialized.set(true);
        }

        if (mavenLuceneIndex != null) {
            return;
        }

        // the lucene index was not set
        throw new IOException("Index is not properly initialized: lucene index is not available.");
    }
}
/**
 * Queries the index for entries matching all of the given (non-null) coordinates exactly.
 * <p>
 * Each non-null argument is added as a mandatory term on its index field
 * ({@code g}, {@code a}, {@code v}, {@code c}, {@code p}); null arguments are not constrained.
 *
 * @param groupId              group id to match, or null
 * @param artifactId           artifact id to match
 * @param version              version to match, or null
 * @param classifier           classifier to match, or null
 * @param extensionOrPackaging value to match on field {@code p}, or null
 * @param n                    maximum number of documents to return
 * @return the list of matches as returned by the index
 * @throws IOException if the index cannot be initialized
 */
public @NonNull List queryByGAV(String groupId, @NonNull String artifactId,
                                String version, String classifier, String extensionOrPackaging,
                                int n) throws IOException {
    ensureInitialized();

    // index field name paired with the requested value, in the order the clauses are added
    final String[][] fieldValuePairs = {
            {"g", groupId},
            {"a", artifactId},
            {"v", version},
            {"c", classifier},
            {"p", extensionOrPackaging},
    };

    final BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
    for (final String[] fieldValuePair : fieldValuePairs) {
        final String value = fieldValuePair[1];
        if (value == null) {
            continue;
        }
        queryBuilder.add(new TermQuery(new Term(fieldValuePair[0], value)), BooleanClause.Occur.MUST);
    }

    return mavenLuceneIndex.runQuery(queryBuilder.build(), n);
}
/**
 * Closes the underlying lucene index, if it was opened.
 * <p>
 * Calling this method more than once is harmless. After closing, {@link #ensureInitialized()}
 * reports the index as unavailable instead of handing out a closed index.
 * <p>
 * Only the {@code initialized} monitor is taken here. Also locking on {@code this} would acquire
 * the two monitors in the opposite order of {@code ensureInitialized()} -> {@code init()} and
 * could deadlock with a concurrent initialization.
 *
 * @throws Exception if closing the lucene index fails
 */
@Override
public void close() throws Exception {
    synchronized (initialized) {
        final MavenLuceneIndex index = mavenLuceneIndex;
        if (index == null) {
            return;
        }

        // drop the reference first so it is cleared even if close() throws
        mavenLuceneIndex = null;
        index.close();
    }
}
/**
 * Queries the filename fields of the index with a wildcard query.
 * <p>
 * The query string is used as a wildcard pattern on the fields {@code df} and {@code ef}; a match
 * on either field is sufficient. Literal parts of the pattern should be escaped beforehand.
 *
 * @param filenameQueryString query string to query filename fields for (uses WildcardQuery)
 * @param n maximum number of documents to return
 * @return returns a list of matching documents
 * @throws IOException throws on failure
 * @see #sanitizeForWildcardQuery(String)
 */
public List queryByFilename(@NonNull String filenameQueryString, int n) throws IOException {
    ensureInitialized();

    final String[] filenameFields = {"df", "ef"};

    final BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
    for (final String filenameField : filenameFields) {
        final WildcardQuery fieldQuery = new WildcardQuery(new Term(filenameField, filenameQueryString));
        queryBuilder.add(fieldQuery, BooleanClause.Occur.SHOULD);
    }

    final BooleanQuery filenameQuery = queryBuilder.build();
    return mavenLuceneIndex.runQuery(filenameQuery, n);
}
/**
 * Escapes a literal string so that it can be embedded in a wildcard query pattern.
 * <p>
 * A backslash is put in front of every backslash, {@code *} and {@code ?}; all other characters
 * are copied unchanged.
 *
 * @param literalString the literal text to escape; must not be null
 * @return the escaped text
 */
public static String sanitizeForWildcardQuery(final String literalString) {
    final StringBuilder escaped = new StringBuilder(literalString.length() + 8);
    for (int position = 0; position < literalString.length(); position++) {
        final char current = literalString.charAt(position);
        switch (current) {
            case '\\':
            case '*':
            case '?':
                escaped.append('\\');
                break;
            default:
                break;
        }
        escaped.append(current);
    }
    return escaped.toString();
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy