/*
 * Copyright 2021-2024 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.metaeffekt.artifact.resolver.maven.index;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectReader;
import com.fasterxml.jackson.databind.type.TypeFactory;
import lombok.NonNull;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.metaeffekt.artifact.resolver.download.WebAccess;
import org.metaeffekt.artifact.resolver.generic.utils.MarkerQueryResult;
import org.metaeffekt.artifact.resolver.generic.utils.MarkerUtils;
import org.metaeffekt.artifact.resolver.maven.index.lucene.MavenLuceneIndex;
import org.metaeffekt.artifact.resolver.maven.index.lucene.MavenLuceneIndexer;
import org.metaeffekt.artifact.resolver.model.DownloadLocation;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.zip.GZIPInputStream;

/**
 * Index of maven central.
 *
 * Constructed from a preprocessed dump of their custom data format.
 *
 * Only works for Maven Central, reconstructing the index if the input file's digest changed.
 */
@Slf4j
public class MavenCentralIndex implements AutoCloseable {
    private static final int HASH_BUFFER_SIZE = 1048576;
    private static final String DUMP_HASH_KEY = "DUMP-HASH";

    private final AtomicBoolean initialized = new AtomicBoolean(false);

    private final WebAccess webAccess;
    private final MavenCentralIndexConfig config;
    private final File dumpFile;
    private final File luceneIndexDir;

    private MavenLuceneIndex mavenLuceneIndex = null;

    public MavenCentralIndex(DownloadLocation downloadLocation, WebAccess webAccess, MavenCentralIndexConfig config) {
        this.webAccess = webAccess;
        this.config = config;
        this.dumpFile = downloadLocation.deriveDownloadFolder("maven-repo-index", "dumps");
        this.luceneIndexDir = downloadLocation.deriveDownloadFolder("maven-repo-index", "index");
    }

    private static String getSha512Hex(InputStream inputStream) throws IOException {
        final long hashStart = System.currentTimeMillis();

        // get digest
        final MessageDigest digest;
        try {
            digest = MessageDigest.getInstance("SHA-512");
        } catch (NoSuchAlgorithmException e) {
            // should not happen
            throw new RuntimeException("Should never happen: Could not find valid algorithm.", e);
        }

        long contentSize = 0;
        final byte[] buffer = new byte[HASH_BUFFER_SIZE];
        int read = inputStream.read(buffer, 0, HASH_BUFFER_SIZE);
        while (read != -1) {
            digest.update(buffer, 0, read);
            contentSize += read;
            read = inputStream.read(buffer, 0, HASH_BUFFER_SIZE);
        }

        final String contentHash = Hex.encodeHexString(digest.digest());

        final long hashFinish = System.currentTimeMillis();
        final long durationMs = hashFinish - hashStart;
        final double durationS = (double) durationMs / 1000.0;
        final double contentSizeMiB = ((double) (contentSize / 1048576));
        final double megabytesPerSecond = contentSizeMiB / durationS;
        log.debug("Hashed input [{}]MiB in [{}]ms ([{}]MiB/s).",
                contentSizeMiB, durationMs, String.format("%.2f", megabytesPerSecond));

        return contentHash;
    }

    private static String getSha512Hex(File toHash) throws IOException {
        try (final InputStream inputStream = Files.newInputStream(toHash.toPath())) {
            return getSha512Hex(inputStream);
        }
    }

    private static long fillIndex(MavenLuceneIndexer mavenLuceneIndexer, File jsonDumpGzip) throws IOException {
        final AtomicLong total = new AtomicLong(0);

        log.trace("Clearing index...");
        // delete all previous entries to make for a fresh index
        mavenLuceneIndexer.clear();

        log.trace("Filling index...");
        try (final GZIPInputStream inputStream = new GZIPInputStream(Files.newInputStream(jsonDumpGzip.toPath()), 1048576)) {
            final ObjectReader objectReader = new ObjectMapper().readerFor(
                    TypeFactory.defaultInstance().constructMapType(HashMap.class, String.class, HashSet.class));

            log.info("Reading maven central index with stream of lines and parallel indexing...");
            try (final InputStreamReader streamReader = new InputStreamReader(inputStream, StandardCharsets.UTF_8);
                 final BufferedReader bufferedReader = new BufferedReader(streamReader, 1048576)) {
                // stream the lines and run indexing in parallel, bounded by reader speed
                bufferedReader.lines().parallel().forEach(line -> {
                    // parse each line and add it as an entry
                    try {
                        mavenLuceneIndexer.addEntry(objectReader.readValue(line));
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }

                    // increment total and capture it to print progress
                    final long totalSnapshot = total.incrementAndGet();

                    // print progress
                    if (totalSnapshot % 10000000 == 0) {
                        log.debug("Imported so far: [{}]", totalSnapshot);
                    }
                });
log.debug("Committing filled index..."); // ensure we wrote everything cleanly before proceeding mavenLuceneIndexer.commit(); } } log.trace("Created [{}] documents.", total.get()); return total.get(); } private static void deleteFile(File target) { if (!target.delete()) { log.warn("Deletion failed unexpectedly: [{}]. Permission issue?", target.toPath()); } } private boolean isLuceneIndexRelated(String localIndexDumpHash) { try { try (final MavenLuceneIndex index = new MavenLuceneIndex(luceneIndexDir, new WhitespaceAnalyzer())) { // if the index contained this hash as a dump hash key, we built it over the same dump return !index.lookupContains(DUMP_HASH_KEY, localIndexDumpHash, 64).isEmpty(); } } catch (Exception e) { log.info("Irregularity while checking relatedness of local dump and local lucene index. Returning false."); return false; } } private synchronized void init() throws IOException { log.info("Initializing Maven Central Index..."); final File ndjsonDumpGzip = deriveLocalIndexDumpFile(); // calculate hash for comparison against remote and for checking that the lucene index is related final String localIndexDumpHash = getLocalIndexDumpHash(ndjsonDumpGzip); boolean requiresIndexUpdate = downloadOrReuseIndexDump(ndjsonDumpGzip, luceneIndexDir, localIndexDumpHash); requiresIndexUpdate |= !isLuceneIndexRelated(localIndexDumpHash); if (requiresIndexUpdate) { createIndex(ndjsonDumpGzip, localIndexDumpHash); } // NOTE: nexus has a custom analyzer; we currently use the lucene defaults mavenLuceneIndex = new MavenLuceneIndex(luceneIndexDir, new WhitespaceAnalyzer()); // dummy lookup since we want to guarantee loading at this very point mavenLuceneIndex.lookupContains("a", "b", 64); log.info("Initializing Maven Central Index completed. Operating on [{}] entries.", mavenLuceneIndex.size()); } private void createIndex(File ndjsonDumpGzip, final String indexContentHash) throws IOException { final String debugRef = this.getClass().getSimpleName(); final File luceneDirDummyFile = new File(luceneIndexDir, ".DUMMY"); final File markerFile = MarkerUtils.deriveMarkerFileFromDestination(luceneDirDummyFile); final MarkerQueryResult markerQueryResult = MarkerUtils.queryMarker(markerFile, debugRef); if (markerQueryResult.getFoundTarget() == null) { log.debug("Rebuilding index from json dump..."); // marker handling (preliminary deletion of an now-outdated marker file) if (markerFile.exists() && !markerFile.delete()) { log.warn("Failed to delete marker [{}] while building new index. Might lead to errors.", markerFile.toPath()); } long total; try (MavenLuceneIndexer mavenLuceneIndexer = new MavenLuceneIndexer(luceneIndexDir, new WhitespaceAnalyzer())) { total = fillIndex(mavenLuceneIndexer, ndjsonDumpGzip); // adding extra field of hash of input dump to check validity / readability of the index later mavenLuceneIndexer.addEntry( Collections.singletonMap(DUMP_HASH_KEY, Collections.singleton(indexContentHash))); // marker handling (success) try { if (!luceneDirDummyFile.exists() && !luceneDirDummyFile.createNewFile()) { log.warn("Could not create dummy file for marker handling. 
                    Files.write(luceneDirDummyFile.toPath(), "- dummy content -".getBytes(StandardCharsets.UTF_8));
                    MarkerUtils.markSuccess(luceneDirDummyFile, markerFile, debugRef);
                } catch (IOException e) {
                    log.warn("Could not create marker [{}] for built index [{}].", markerFile, debugRef, e);
                    // discard exception in the name of resilience, however future runs may be impacted
                }
            } catch (Exception e) {
                throw new IOException(e.getMessage(), e);
            }

            log.info("Maven central index constructed with [{}] entries.", total);
        } else {
            log.debug("Reusing previously initialized index.");
        }
    }

    /**
     * Gets a sha512 of a downloaded index dump.
     *
     * @param localIndexDumpFile the locally downloaded dump file
     * @return returns a hex-encoded sha512
     * @throws IOException throws on failure to read the file
     */
    private String getLocalIndexDumpHash(File localIndexDumpFile) throws IOException {
        // determine hash of file content
        log.debug("Hashing input json dump...");
        String indexContentHash = null;
        if (localIndexDumpFile.exists()) {
            try {
                indexContentHash = getSha512Hex(localIndexDumpFile).trim();
            } catch (IOException e) {
                log.error("Error while trying to get hash of existing dump at [{}].", localIndexDumpFile.toPath());
                throw e;
            }
        }
        return indexContentHash;
    }

    private boolean downloadOrReuseIndexDump(File localIndexDumpFile, File localLuceneIndexDir,
            String localIndexDumpHash) throws IOException {
        // check whether local file exists
        if (localIndexDumpFile.exists() && !Files.isRegularFile(localIndexDumpFile.toPath())) {
            log.error("Input isn't a regular file: [{}]. Initialization failure.", localIndexDumpFile.toPath());
            throw new IOException("Expected regular file at configured dump json location.");
        }

        // download precomputed hash of remote file from web server
        final String remoteHash;
        try (WebAccess.WebSession session = webAccess.createSession()) {
            final String sha512Uri = config.getNdjsonDumpGzipUrl() + ".sha512";
            remoteHash = session.downloadToUtf8String(sha512Uri, (response) -> log.error(
                    "Could not get .sha512 file from intended download server [{}] due to http status [{}].",
                    sha512Uri, response.getStatusLine().getStatusCode())).map(String::trim).orElse(null);
        }

        if (remoteHash == null && localIndexDumpFile.exists()) {
            log.warn("Could not update index. Remote server was not available. Using possibly outdated data!");
            log.info("Reusing possibly outdated local data with sha512 hash [{}].", localIndexDumpHash);
            return false;
        }

        if (!localIndexDumpFile.exists() || !StringUtils.equals(localIndexDumpHash, remoteHash)) {
            // file changed; force update.
            log.info("Local index dump at [{}] {}. Downloading to [{}].", localIndexDumpFile,
                    (localIndexDumpFile.exists() ? "outdated" : "missing"), localIndexDumpFile.toPath());
"outdated" : "missing"), localIndexDumpFile.toPath()); final File ndjsonDumpGzipMarker = MarkerUtils.deriveMarkerFileFromDestination(localIndexDumpFile); if (localIndexDumpFile.exists()) deleteFile(localIndexDumpFile); if (ndjsonDumpGzipMarker.exists()) deleteFile(ndjsonDumpGzipMarker); // download dump final String debugRef = "maven central index: download to [" + localIndexDumpFile.toPath() + "]"; final File downloaded = MarkerUtils.attemptDownload(this.webAccess, config.getNdjsonDumpGzipUrl(), localIndexDumpFile, debugRef); if (downloaded == null) { throw new IOException("Could not download source nexus index dump."); } try { log.debug("Rehashing downloaded index dump to verify download integrity."); localIndexDumpHash = getSha512Hex(localIndexDumpFile); } catch (IOException e) { log.error("Could not update cryptographic hash of downloaded dump at [{}].", localIndexDumpFile.toPath()); throw e; } if (!StringUtils.equals(localIndexDumpHash, remoteHash)) { log.error("Download to [{}] failed to pass checksum check! Trying to use invalid index.", localIndexDumpFile.toPath()); MarkerUtils.invalidateMarkerFor(downloaded, debugRef); } // in case the input dataset has changed, an update is required return true; } else { log.debug("Using existing dump from [{}].", localIndexDumpFile.toPath()); } if (localIndexDumpHash == null) { throw new IOException("Hash of index content null; must be previous failure while downloading index dump."); } // nothing changed; nothing problematic return !localLuceneIndexDir.exists(); } @NonNull private File deriveLocalIndexDumpFile() { if (config.getNdjsonDumpGzipPath() != null) { return new File(config.getNdjsonDumpGzipPath()); } else if (config.getNdjsonDumpGzipUrl() != null) { return new File(dumpFile, "downloaded-index-dump.ndjson.gz"); } else { log.debug("Aborting initialization: insufficient configuration."); throw new IllegalArgumentException("Could not derive ndjsonDumpGzip, as paths were missing in config."); } } public void ensureInitialized() throws IOException { synchronized (initialized) { if (!initialized.get()) { log.debug("Attempting index initialization..."); init(); initialized.set(true); } // throw exception if luceneIndex was not set if (mavenLuceneIndex == null) { throw new IOException("Index is not properly initialized: lucene index is not available."); } } } public @NonNull List queryByGAV(String groupId, @NonNull String artifactId, String version, String classifier, String extensionOrPackaging, int n) throws IOException { ensureInitialized(); final BooleanQuery.Builder builder = new BooleanQuery.Builder(); if (groupId != null) builder.add(new TermQuery(new Term("g", groupId)), BooleanClause.Occur.MUST); if (artifactId != null) builder.add(new TermQuery(new Term("a", artifactId)), BooleanClause.Occur.MUST); if (version != null) builder.add(new TermQuery(new Term("v", version)), BooleanClause.Occur.MUST); if (classifier != null) builder.add(new TermQuery(new Term("c", classifier)), BooleanClause.Occur.MUST); if (extensionOrPackaging != null) builder.add(new TermQuery(new Term("p", extensionOrPackaging)), BooleanClause.Occur.MUST); BooleanQuery bq = builder.build(); return mavenLuceneIndex.runQuery(bq, n); } @Override public synchronized void close() throws Exception { synchronized (initialized) { if (mavenLuceneIndex != null) { mavenLuceneIndex.close(); } } } /** * Queries the index using the given query string. 
    /**
     * Queries the index using the given query string.
     *
     * @param filenameQueryString query string to query filename fields for (uses WildcardQuery)
     * @param n maximum number of documents to return
     * @return returns a list of matching documents
     * @throws IOException throws on failure
     * @see #sanitizeForWildcardQuery(String)
     */
    public List queryByFilename(@NonNull String filenameQueryString, int n) throws IOException {
        ensureInitialized();

        final BooleanQuery.Builder builder = new BooleanQuery.Builder();

        builder.add(new WildcardQuery(new Term("df", filenameQueryString)), BooleanClause.Occur.SHOULD);
        builder.add(new WildcardQuery(new Term("ef", filenameQueryString)), BooleanClause.Occur.SHOULD);

        return mavenLuceneIndex.runQuery(builder.build(), n);
    }

    public static String sanitizeForWildcardQuery(final String literalString) {
        return literalString.replace("\\", "\\\\").replace("*", "\\*").replace("?", "\\?");
    }
}
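// ---------------------------------------------------------------------------------------------
// Usage sketch (illustrative addition, not part of the original source): shows how the index
// might be constructed and queried. How the DownloadLocation, WebAccess and MavenCentralIndexConfig
// instances are obtained is assumed here and depends on the surrounding resolver setup; the GAV
// and filename values are placeholders, and the class name MavenCentralIndexUsageSketch is hypothetical.
// ---------------------------------------------------------------------------------------------
class MavenCentralIndexUsageSketch {

    static void example(DownloadLocation downloadLocation, WebAccess webAccess,
            MavenCentralIndexConfig config) throws Exception {
        try (MavenCentralIndex index = new MavenCentralIndex(downloadLocation, webAccess, config)) {
            // the first query triggers ensureInitialized(), which downloads or reuses the ndjson
            // dump and rebuilds the local lucene index if the dump's sha512 digest changed
            final List<?> byGav = index.queryByGAV(
                    "org.apache.commons", "commons-lang3", "3.14.0", null, "jar", 10);

            // filename lookups use WildcardQuery over the "df"/"ef" fields; escape literal
            // '*', '?' and '\' in the query string before appending any wildcards of your own
            final String pattern = MavenCentralIndex.sanitizeForWildcardQuery("commons-lang3-3.14.0.jar");
            final List<?> byFilename = index.queryByFilename(pattern, 10);

            System.out.println(byGav.size() + " GAV matches, " + byFilename.size() + " filename matches");
        }
    }
}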



