// com.metaeffekt.mirror.index.Index (Maven / Gradle / Ivy)
/*
* Copyright 2021-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.metaeffekt.mirror.index;
import com.metaeffekt.artifact.analysis.utils.BuildProperties;
import com.metaeffekt.artifact.analysis.utils.FileUtils;
import com.metaeffekt.artifact.analysis.utils.StringUtils;
import com.metaeffekt.artifact.analysis.utils.TimeUtils;
import com.metaeffekt.mirror.Mirror;
import com.metaeffekt.mirror.download.documentation.MirrorMetadata;
import com.metaeffekt.mirror.concurrency.ScheduledDelayedThreadPoolExecutor;
import com.metaeffekt.mirror.download.Download;
import com.metaeffekt.mirror.index.advisor.*;
import com.metaeffekt.mirror.index.nvd.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
 * Base class for mirror indexes that are stored as an Apache Lucene index.
 * Subclasses specify the data to be indexed by implementing {@link #createIndexDocuments()}.
 */
public abstract class Index extends Mirror {
/** Logger shared by all index instances. */
private final static Logger LOG = LoggerFactory.getLogger(Index.class);
/** Directory holding the Lucene index files; set by the constructor to {@code <baseMirrorDirectory>/index/<mirrorIdentifier>}. */
protected final File indexIntoDirectory;
/** Timeout handed to {@code waitForFileUnlockIfLocked} before the index is checked/rebuilt. */
protected int lockFileTimeout = 10 * 60 * 1000; // 10 minutes
/** Executor exposed to subclasses; not used inside this class. NOTE(review): meaning of the arguments (16, 0) is defined by ScheduledDelayedThreadPoolExecutor - confirm. */
protected final ScheduledDelayedThreadPoolExecutor executor = new ScheduledDelayedThreadPoolExecutor(16, 0);
/** Lucene directory opened on {@link #indexIntoDirectory} by the constructor. */
protected final Directory luceneDirectory;
/** Lazily opened reader; dropped by {@code resetCachedReaderSearcher()}. */
private IndexReader cachedIndexReader;
/** Lazily created searcher on top of {@link #cachedIndexReader}; dropped together with it. */
private org.apache.lucene.search.IndexSearcher cachedIndexSearcher;
/** Download directories ({@code <baseMirrorDirectory>/download/<identifier>}) that are checked before and during indexing. */
protected final File[] requiredDownloads;
/** Index instances whose last-updated timestamp is compared against this index to detect staleness. */
protected final Index[] requiredIndexes;
/** Download directories resolved like the required ones; this class only logs when they are missing. */
protected final File[] optionalDownloads;
/** Index instances resolved like the required ones; this class only logs when they are missing. */
protected final Index[] optionalIndexes;
/**
 * Creates an index that has required dependencies only; the optional download and index lists are empty.
 *
 * @param baseMirrorDirectory root directory of the mirror
 * @param indexClass          concrete index class, used to derive the directory identifier
 * @param requiredDownloads   download classes this index depends on
 * @param requiredIndexes     index classes this index depends on
 */
public Index(File baseMirrorDirectory, Class<? extends Index> indexClass, List<Class<? extends Download>> requiredDownloads, List<Class<? extends Index>> requiredIndexes) {
    this(baseMirrorDirectory, indexClass, requiredDownloads, requiredIndexes, Collections.emptyList(), Collections.emptyList());
}
public Index(File baseMirrorDirectory, Class extends Index> indexClass, List> requiredDownloads, List> requiredIndexes, List> optionalDownloads, List> optionalIndexes) {
super(baseMirrorDirectory, Index.getDirectoryIdentifier(indexClass));
this.indexIntoDirectory = new File(new File(super.baseMirrorDirectory, "index"), super.mirrorIdentifier);
if (!this.indexIntoDirectory.exists()) {
this.indexIntoDirectory.mkdirs();
}
try {
luceneDirectory = FSDirectory.open(indexIntoDirectory.toPath());
} catch (IOException e) {
throw new RuntimeException("Unable to open lucene index directory inside " + baseMirrorDirectory.getAbsolutePath(), e);
}
this.requiredDownloads = new File[requiredDownloads.size()];
for (int i = 0; i < requiredDownloads.size(); i++) {
this.requiredDownloads[i] = new File(new File(super.baseMirrorDirectory, "download"), Download.getDirectoryIdentifier(requiredDownloads.get(i)));
if (!this.requiredDownloads[i].exists()) {
LOG.debug("Required download does not exist in {} for index [{}] (this can be ignored if only read access is required) in {}", this.requiredDownloads[i].getClass().getSimpleName(), mirrorIdentifier, getClass().getName());
}
}
this.requiredIndexes = new Index[requiredIndexes.size()];
for (int i = 0; i < requiredIndexes.size(); i++) {
this.requiredIndexes[i] = Index.getInstance(requiredIndexes.get(i), baseMirrorDirectory);
try {
this.requiredIndexes[i].assertExists();
} catch (Exception e) {
LOG.debug("Required index does not exist in {} for index [{}] (this can be ignored if only read access is required): {} in {}", this.requiredIndexes[i].getClass().getSimpleName(), mirrorIdentifier, e.getMessage(), getClass().getSimpleName());
}
}
this.optionalDownloads = new File[optionalDownloads.size()];
for (int i = 0; i < optionalDownloads.size(); i++) {
this.optionalDownloads[i] = new File(new File(super.baseMirrorDirectory, "download"), Download.getDirectoryIdentifier(optionalDownloads.get(i)));
if (!this.optionalDownloads[i].exists()) {
LOG.debug("Optional download does not exist in {} for index [{}] (this can be ignored if index is not required) in {}", this.optionalDownloads[i].getClass().getSimpleName(), mirrorIdentifier, getClass().getName());
}
}
this.optionalIndexes = new Index[optionalIndexes.size()];
for (int i = 0; i < optionalIndexes.size(); i++) {
this.optionalIndexes[i] = Index.getInstance(optionalIndexes.get(i), baseMirrorDirectory);
try {
this.optionalIndexes[i].assertExists();
} catch (Exception e) {
LOG.debug("Optional index does not exist in {} for index [{}] (this can be ignored if index is not required): {} in {}", this.optionalIndexes[i].getClass().getSimpleName(), mirrorIdentifier, e.getMessage(), getClass().getSimpleName());
}
}
}
/**
 * Overrides the timeout that is passed to {@code waitForFileUnlockIfLocked} in {@code createIndexIfRequired()}.
 * The field default is {@code 10 * 60 * 1000}, commented as 10 minutes, i.e. the unit is milliseconds.
 *
 * @param lockFileTimeout the new timeout
 */
public void setLockFileTimeout(int lockFileTimeout) {
this.lockFileTimeout = lockFileTimeout;
}
/**
 * Looks up the first required index that is an instance of the given class.
 *
 * @param indexClass the index type to look for
 * @param <T>        the index type
 * @return the matching required index, or {@code null} if none of the required indexes has that type
 */
public <T extends Index> T getRequiredIndex(Class<T> indexClass) {
    for (Index candidate : this.requiredIndexes) {
        if (indexClass.isInstance(candidate)) {
            return indexClass.cast(candidate);
        }
    }
    return null;
}
/**
 * @return the directory the Lucene index of this instance is stored in
 */
public File getIndexIntoDirectory() {
return indexIntoDirectory;
}
/**
 * Verifies that the index directory exists, is a directory and contains at least two entries.
 *
 * @throws RuntimeException if one of these conditions is not met or the directory cannot be listed
 */
public void assertExists() {
    if (!this.indexIntoDirectory.exists()) {
        throw new RuntimeException("Index directory does not exist: " + this.indexIntoDirectory);
    }
    if (!this.indexIntoDirectory.isDirectory()) {
        throw new RuntimeException("Index directory is not a directory: " + this.indexIntoDirectory);
    }

    // listFiles() returns null on I/O errors, even for an existing directory
    final File[] indexFiles = this.indexIntoDirectory.listFiles();
    if (indexFiles == null) {
        throw new RuntimeException("Unable to list files in index directory: " + this.indexIntoDirectory);
    }
    if (indexFiles.length < 2) {
        throw new RuntimeException("Index directory is empty: " + this.indexIntoDirectory);
    }
}
/**
 * Rebuilds the index if {@link #isIndexingRequired()} reports that it is outdated.
 * <p>
 * The index directory is locked for the duration of the operation. Before rebuilding, a backup of the
 * current index is created; if anything fails, the failure flag is written, the backup is restored and
 * the failure is rethrown. The backup is removed and the lock released in every case.
 *
 * @throws RuntimeException if the index could not be updated
 */
public void createIndexIfRequired() {
    boolean outcome = false;
    try {
        super.logTitle("");

        super.waitForFileUnlockIfLocked(indexIntoDirectory, lockFileTimeout);
        super.lockFile(indexIntoDirectory);

        if (isIndexingRequired()) {
            createIndexBackup();
            clearIndex();
            createIndex();
            setLastUpdatedToNow();
        } else {
            LOG.info("Index is already up to date: {}", indexIntoDirectory);
        }

        super.propertyFiles.set(indexIntoDirectory, "info", InfoFileAttributes.INDEX_FAILED_FLAG.getKey(), "false");
        outcome = true;
    } catch (Exception e) {
        try {
            super.propertyFiles.set(indexIntoDirectory, "info", InfoFileAttributes.INDEX_FAILED_FLAG.getKey(), "true");
        } catch (Exception flagException) {
            // best effort: the original failure below is the one that matters, but do not hide this one completely
            LOG.warn("Unable to persist index failure flag in {}", indexIntoDirectory, flagException);
        }
        loadIndexBackup();
        throw new RuntimeException("Unable to update index in " + indexIntoDirectory + "\n" + e.getMessage(), e);
    } finally {
        try {
            removeIndexBackup();
        } finally {
            // release the lock even if removing the backup fails, otherwise the index stays locked
            super.unlockFile(indexIntoDirectory);
            if (outcome) {
                super.logTitle("Done: ");
            } else {
                super.logTitle("FAILED: ");
            }
        }
    }
}
/**
 * Builds the index: verifies the required downloads, lets the subclass create the documents and writes
 * them to the Lucene index. Nothing is written if the subclass returns no documents.
 *
 * @throws RuntimeException if required downloads are missing/failed or writing fails
 */
protected void createIndex() {
    assertRequiredDownloadsExist();

    LOG.info("Creating index documents for index in: {}", indexIntoDirectory);
    final Map<String, Document> indexDocuments = createIndexDocuments();

    if (!indexDocuments.isEmpty()) {
        writeIndexDocuments(indexDocuments);
    }
}
/**
 * Copies the current index directory into a sibling directory named {@code <index>-backup}.
 * An already existing backup directory is emptied first. I/O failures are logged, not propagated.
 */
private void createIndexBackup() {
    final File backupTarget = new File(indexIntoDirectory.getParentFile(), indexIntoDirectory.getName() + "-backup");
    try {
        // get rid of leftovers of a previous run
        if (backupTarget.exists()) {
            FileUtils.cleanDirectory(backupTarget);
        }

        if (indexIntoDirectory.exists()) {
            FileUtils.copyDirectory(indexIntoDirectory, backupTarget);
            LOG.info("Created index backup in: {}", backupTarget);
        } else {
            LOG.warn("Index directory does not exist, unable to create backup: {}", indexIntoDirectory);
        }
    } catch (IOException e) {
        LOG.error("Unable to create index backup for " + indexIntoDirectory.getAbsolutePath(), e);
    }
}
/**
 * Restores the index directory from the sibling {@code <index>-backup} directory.
 * The index directory is emptied before the backup is copied back. A missing backup and I/O failures
 * are logged, not propagated.
 */
private void loadIndexBackup() {
    final File backupSource = new File(indexIntoDirectory.getParentFile(), indexIntoDirectory.getName() + "-backup");
    try {
        if (backupSource.exists()) {
            if (indexIntoDirectory.exists()) {
                FileUtils.cleanDirectory(indexIntoDirectory);
            }
            FileUtils.copyDirectory(backupSource, indexIntoDirectory);
            LOG.info("Loaded index backup from: {}", backupSource);
        } else {
            LOG.error("Unable to load index backup for " + indexIntoDirectory.getAbsolutePath() + " as it does not exist");
        }
    } catch (IOException e) {
        LOG.error("Unable to load index backup for " + indexIntoDirectory.getAbsolutePath(), e);
    }
}
/**
 * Deletes the sibling {@code <index>-backup} directory if it exists.
 */
private void removeIndexBackup() {
    final String backupName = indexIntoDirectory.getName() + "-backup";
    final File obsoleteBackup = new File(indexIntoDirectory.getParentFile(), backupName);
    if (!obsoleteBackup.exists()) {
        return;
    }
    FileUtils.deleteDir(obsoleteBackup);
}
/**
 * Writes the given documents to the Lucene index and clears the passed map afterwards.
 * <p>
 * Each document is written via {@code updateDocument} using a {@code doc_id} term that is derived from the
 * hash code of the map key. Entries with a {@code null} key are skipped with a warning.
 * The cached reader/searcher are dropped after a successful write so that later reads see the new data.
 *
 * @param indexDocuments documents to write, keyed by their identifier; emptied on success
 * @throws RuntimeException wrapping the {@link IOException} if writing fails
 */
protected void writeIndexDocuments(Map<String, Document> indexDocuments) {
    LOG.info("Indexing [{}] document{}", indexDocuments.size(), indexDocuments.size() == 1 ? "" : "s");

    try {
        final IndexWriterConfig luceneWriterConfig = new IndexWriterConfig(Analyzers.getStandardAnalyzer());
        luceneWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

        // try-with-resources: the writer (and with it the Lucene write lock) is released even if a document fails
        try (IndexWriter luceneDocumentWriter = new IndexWriter(luceneDirectory, luceneWriterConfig)) {
            for (Map.Entry<String, Document> indexDocument : indexDocuments.entrySet()) {
                if (indexDocument.getKey() == null) {
                    LOG.warn("Index document has no key: {}", indexDocument.getValue());
                    continue;
                }

                // https://stackoverflow.com/questions/73484766/lucene-index-query-does-not-find-document-if-too-many-documents-similar-document
                // NOTE(review): distinct keys with equal hash codes share the same term - confirm this is acceptable
                final Term term = new Term("doc_id", String.valueOf(indexDocument.getKey().hashCode()));
                luceneDocumentWriter.updateDocument(term, indexDocument.getValue());
            }
        }

        resetCachedReaderSearcher();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    indexDocuments.clear();
}
/**
 * Closes the cached index reader (ignoring I/O errors on close) and forgets both the cached reader and
 * the cached searcher, so that the next access re-opens them.
 */
private void resetCachedReaderSearcher() {
    final IndexReader openReader = cachedIndexReader;
    if (openReader != null) {
        try {
            openReader.close();
        } catch (IOException ignored) {
            // closing is best effort; the reader is discarded either way
        }
    }

    cachedIndexSearcher = null;
    cachedIndexReader = null;
}
/**
 * Returns the cached index reader, opening it on the Lucene directory on first use.
 *
 * @return the index reader, never {@code null}
 * @throws IOException if the index cannot be opened, e.g. because it does not exist or is corrupt
 */
private IndexReader getIndexReader() throws IOException {
    if (cachedIndexReader == null) {
        // let the IOException propagate: swallowing it would hand out null and hide the actual cause behind a NullPointerException
        cachedIndexReader = DirectoryReader.open(luceneDirectory);
    }
    return cachedIndexReader;
}
/**
 * Returns the cached index searcher, creating it on top of the index reader on first use.
 *
 * @return the index searcher
 * @throws IOException declared for obtaining the index reader
 */
private org.apache.lucene.search.IndexSearcher getIndexSearcher() throws IOException {
    if (cachedIndexSearcher != null) {
        return cachedIndexSearcher;
    }
    cachedIndexSearcher = new org.apache.lucene.search.IndexSearcher(getIndexReader());
    return cachedIndexSearcher;
}
/**
 * Loads all live (non-deleted) documents of the index.
 *
 * @return all documents in ascending order of their Lucene document number
 * @throws RuntimeException wrapping the {@link IOException} if the index cannot be read
 */
public List<Document> findAllDocuments() {
    try {
        final IndexReader indexReader = getIndexReader();
        // document numbers range up to maxDoc(); numDocs() only counts live documents
        final int maxDoc = indexReader.maxDoc();
        // null if the index has no deletions
        final org.apache.lucene.util.Bits liveDocs = MultiBits.getLiveDocs(indexReader);

        final List<Document> documents = new ArrayList<>(indexReader.numDocs());
        for (int docId = 0; docId < maxDoc; docId++) {
            if (liveDocs == null || liveDocs.get(docId)) {
                documents.add(indexReader.document(docId));
            }
        }
        return documents;
    } catch (IOException e) {
        throw new RuntimeException("Unable to find all documents in index", e);
    }
}
/**
 * @return the number of documents in the index as reported by {@code IndexReader.numDocs()}
 * @throws RuntimeException wrapping the {@link IOException} if the index cannot be read
 */
public int documentCount() {
    try {
        return getIndexReader().numDocs();
    } catch (IOException e) {
        // the original message was copied from findAllDocuments and did not describe this operation
        throw new RuntimeException("Unable to count documents in index", e);
    }
}
/**
 * Passes every live (non-deleted) document of the index to the consumer, one at a time, without
 * collecting them in memory.
 *
 * @param consumer invoked once per document, in ascending order of the Lucene document number
 * @throws RuntimeException wrapping the {@link IOException} if the index cannot be read
 */
public void findAndProcessAllDocuments(Consumer<Document> consumer) {
    try {
        final IndexReader indexReader = getIndexReader();
        // document numbers range up to maxDoc(); numDocs() only counts live documents
        final int maxDoc = indexReader.maxDoc();
        // null if the index has no deletions
        final org.apache.lucene.util.Bits liveDocs = MultiBits.getLiveDocs(indexReader);

        for (int docId = 0; docId < maxDoc; docId++) {
            if (liveDocs == null || liveDocs.get(docId)) {
                consumer.accept(indexReader.document(docId));
            }
        }
    } catch (IOException e) {
        throw new RuntimeException("Unable to find all documents in index", e);
    }
}
/**
 * Passes the live (non-deleted) documents of the index to the function until it asks to stop.
 *
 * @param consumer invoked once per document; return {@code true} to continue, {@code false} to stop iterating
 * @throws RuntimeException wrapping the {@link IOException} if the index cannot be read
 */
public void findAndProcessAllDocumentsCancelable(Function<Document, Boolean> consumer) {
    try {
        final IndexReader indexReader = getIndexReader();
        // document numbers range up to maxDoc(); numDocs() only counts live documents
        final int maxDoc = indexReader.maxDoc();
        // null if the index has no deletions
        final org.apache.lucene.util.Bits liveDocs = MultiBits.getLiveDocs(indexReader);

        for (int docId = 0; docId < maxDoc; docId++) {
            if (liveDocs != null && !liveDocs.get(docId)) {
                continue;
            }
            final boolean continueSearch = consumer.apply(indexReader.document(docId));
            if (!continueSearch) {
                break;
            }
        }
    } catch (IOException e) {
        throw new RuntimeException("Unable to find all documents in index", e);
    }
}
/**
 * Passes every live (non-deleted) document of the index to the consumer together with its Lucene
 * document number. For an index without deletions this number is identical to a running index.
 *
 * @param consumer invoked once per document with the document and its document number
 * @throws RuntimeException wrapping the {@link IOException} if the index cannot be read
 */
public void findAndProcessAllDocuments(BiConsumer<Document, Integer> consumer) {
    try {
        final IndexReader indexReader = getIndexReader();
        // document numbers range up to maxDoc(); numDocs() only counts live documents
        final int maxDoc = indexReader.maxDoc();
        // null if the index has no deletions
        final org.apache.lucene.util.Bits liveDocs = MultiBits.getLiveDocs(indexReader);

        for (int docId = 0; docId < maxDoc; docId++) {
            if (liveDocs == null || liveDocs.get(docId)) {
                consumer.accept(indexReader.document(docId), docId);
            }
        }
    } catch (IOException e) {
        throw new RuntimeException("Unable to find all documents in index", e);
    }
}
/**
 * Runs the given search against this index.
 *
 * @param indexSearch the search to execute with the index searcher of this index
 * @return the documents returned by the search
 * @throws RuntimeException if the index cannot be read, the query is malformed or any other error occurs
 */
public List<Document> findDocuments(IndexSearch indexSearch) {
    try {
        return indexSearch.search(getIndexSearcher());
    } catch (IOException e) {
        throw new RuntimeException("Unable to search index for [" + indexSearch + "]", e);
    } catch (ParseException e) {
        throw new RuntimeException("Malformed search query for [" + indexSearch + "]", e);
    } catch (Exception e) {
        throw new RuntimeException("Unknown exception whilst performing query [" + indexSearch + "] on index in " + this.indexIntoDirectory.getAbsolutePath() + " ; make sure that the index exists and is valid", e);
    }
}
/**
 * Runs the given Lucene query against this index and loads all matching documents.
 *
 * @param query the query to execute
 * @return the matching documents in the order returned by the searcher
 * @throws RuntimeException if the index cannot be read or any other error occurs
 */
public List<Document> findDocuments(Query query) {
    try {
        final org.apache.lucene.search.IndexSearcher indexSearcher = getIndexSearcher();
        final TopDocs topDocs = indexSearcher.search(query, Integer.MAX_VALUE);

        // iterate over the hits that were actually returned; totalHits.value is a long and may only be a lower bound
        final int hitCount = topDocs.scoreDocs.length;
        final List<Document> documents = new ArrayList<>(hitCount);
        for (int i = 0; i < hitCount; i++) {
            documents.add(indexSearcher.doc(topDocs.scoreDocs[i].doc));
        }
        return documents;
    } catch (IOException e) {
        throw new RuntimeException("Unable to search index for [" + query + "]", e);
    } catch (Exception e) {
        throw new RuntimeException("Unknown exception whilst performing query [" + query + "] on index in " + this.indexIntoDirectory.getAbsolutePath() + " ; make sure that the index exists and is valid", e);
    }
}
/**
 * Decides whether the index has to be (re-)built and records the time of this check.
 * <p>
 * Indexing is required if
 * <ul>
 *     <li>the index has no last-updated timestamp,</li>
 *     <li>a required download has no last-updated timestamp, is more recent than the index or has its failed flag set,</li>
 *     <li>the last indexing attempt failed, or</li>
 *     <li>a required index is more recent than this index.</li>
 * </ul>
 *
 * @return {@code true} if the index must be rebuilt
 */
protected boolean isIndexingRequired() {
    this.setLastCheckedToNow();

    final long directoryLastModified = getDirectoryLastModified();
    if (directoryLastModified == 0L) {
        LOG.info("Index directory is empty, indexing is required");
        return true;
    }

    for (File requiredDownload : this.requiredDownloads) {
        final long downloadLastModified = propertyFiles.getLong(requiredDownload, "info", InfoFileAttributes.LAST_UPDATED.getKey())
                .orElse(0L);
        if (downloadLastModified == 0) {
            LOG.info("Required download is empty, attempting to index");
            return true;
        }

        if (downloadLastModified > directoryLastModified) {
            LOG.info("Index is out of date, download [{}] is more recent [{}] --> [{}]", requiredDownload.getName(),
                    TimeUtils.formatNormalizedDate(new Date(downloadLastModified)),
                    TimeUtils.formatNormalizedDate(new Date(directoryLastModified)));
            return true;
        }

        final boolean hasLastDownloadFailed = propertyFiles.getBoolean(requiredDownload, "info", InfoFileAttributes.DOWNLOAD_FAILED_FLAG.getKey())
                .orElse(false);
        if (hasLastDownloadFailed) {
            LOG.info("Last download failed, attempting to index");
            return true;
        }
    }

    // independent of the downloads: must also be evaluated for indexes that have no required downloads at all
    if (hasLastIndexFailed()) {
        LOG.info("Last index failed, attempting to index");
        return true;
    }

    for (Index requiredIndex : this.requiredIndexes) {
        final long indexLastModified = propertyFiles.getLong(requiredIndex.getIndexIntoDirectory(), "info", InfoFileAttributes.LAST_UPDATED.getKey())
                .orElse(0L);
        if (indexLastModified == 0) {
            // no timestamp available for the required index, nothing to compare against
            continue;
        }

        if (indexLastModified > directoryLastModified) {
            LOG.info("Index is out of date, index [{}] is more recent [{}] --> [{}]", requiredIndex.getIndexIntoDirectory().getName(),
                    TimeUtils.formatNormalizedDate(new Date(indexLastModified)),
                    TimeUtils.formatNormalizedDate(new Date(directoryLastModified)));
            return true;
        }
    }

    return false;
}
/**
 * Reads the index-failed flag from the "info" properties of the index directory.
 *
 * @return the stored flag, or {@code false} if no value is stored
 */
public boolean hasLastIndexFailed() {
return propertyFiles.getBoolean(this.indexIntoDirectory, "info", InfoFileAttributes.INDEX_FAILED_FLAG.getKey())
.orElse(false);
}
/**
 * Reads the last-updated timestamp from the "info" properties of the index directory.
 * Note that this is the stored property, not the file system modification time.
 *
 * @return the stored timestamp, or {@code 0} if no value is stored
 */
public long getDirectoryLastModified() {
return propertyFiles.getLong(this.indexIntoDirectory, "info", InfoFileAttributes.LAST_UPDATED.getKey())
.orElse(0L);
}
/**
 * Stores the current time (raw and as date) and the project version in the "info" properties of the
 * index directory, marking the index as freshly updated.
 */
private void setLastUpdatedToNow() {
    final long timestamp = TimeUtils.utcNow();
    final Date timestampAsDate = new Date(timestamp);

    super.propertyFiles.set(indexIntoDirectory, "info", InfoFileAttributes.LAST_UPDATED.getKey(), timestamp);
    super.propertyFiles.set(indexIntoDirectory, "info", InfoFileAttributes.LAST_UPDATED_FORMATTED.getKey(), timestampAsDate);

    final String projectVersion = BuildProperties.getProjectVersion();
    super.propertyFiles.set(indexIntoDirectory, "info", InfoFileAttributes.MIRROR_VERSION.getKey(), projectVersion);

    LOG.info("Set last updated to [{}] in {}", timestampAsDate, indexIntoDirectory);
}
/**
 * Stores the current time (raw and as date) as last-checked timestamp in the "info" properties of the
 * index directory.
 */
private void setLastCheckedToNow() {
    final long now = TimeUtils.utcNow();
    super.propertyFiles.set(indexIntoDirectory, "info", InfoFileAttributes.LAST_CHECKED.getKey(), now);
    super.propertyFiles.set(indexIntoDirectory, "info", InfoFileAttributes.LAST_CHECKED_FORMATTED.getKey(), new Date(now));
    // this method writes the last-checked attributes; the previous message wrongly reported "last updated"
    LOG.info("Set last checked to [{}] in {}", new Date(now), indexIntoDirectory);
}
/**
 * Verifies that every required download directory exists and does not have its failed flag set.
 * All problems are collected and reported in a single exception.
 *
 * @throws RuntimeException listing the missing and/or failed downloads
 */
private void assertRequiredDownloadsExist() {
    final List<File> missingDownloads = new ArrayList<>();
    final List<File> failedDownloads = new ArrayList<>();

    for (File requiredDownload : requiredDownloads) {
        if (!requiredDownload.exists()) {
            missingDownloads.add(requiredDownload);
        }

        final boolean hasLastDownloadFailed = propertyFiles.getBoolean(requiredDownload, "info", InfoFileAttributes.DOWNLOAD_FAILED_FLAG.getKey())
                .orElse(false);
        if (hasLastDownloadFailed) {
            failedDownloads.add(requiredDownload);
        }
    }

    final StringBuilder message = new StringBuilder();

    if (!missingDownloads.isEmpty()) {
        message.append("Cannot create index [").append(this.getClass().getSimpleName()).append("], required downloads are missing: ");
        message.append(missingDownloads.stream().map(File::getName).collect(Collectors.joining(", ")));
    }

    if (!failedDownloads.isEmpty()) {
        if (message.length() > 0) {
            message.append(System.lineSeparator());
        }
        message.append("Cannot create index, required downloads have failed (re-download the data to fix corrupted download): ");
        message.append(failedDownloads.stream().map(File::getName).collect(Collectors.joining(", ")));
    }

    if (message.length() > 0) {
        throw new RuntimeException(message.toString());
    }
}
/**
 * Deletes the index directory after flushing the cached property files and releasing the cached
 * reader/searcher.
 *
 * @return this instance
 * @throws RuntimeException rethrown as-is if clearing fails
 */
public Index clearIndex() {
    try {
        propertyFiles.flushCachedAePropertyFiles();
        // drop the cached reader/searcher: they would otherwise keep serving (and holding open) the deleted index files
        resetCachedReaderSearcher();
        FileUtils.deleteDir(this.indexIntoDirectory);
        LOG.info("Cleared index in {}", this.indexIntoDirectory.getAbsolutePath());
    } catch (Exception e) {
        LOG.warn("Unable to clear index in " + this.indexIntoDirectory.getAbsolutePath());
        throw e;
    }
    return this;
}
/**
 * Creates {@link org.apache.lucene.document.Document}s that will be included in the index.
 * To be able to identify documents, a unique identifier must be provided as key.
 *
 * @return A map of String identifiers with a {@link org.apache.lucene.document.Document} each.
 */
protected abstract Map<String, Document> createIndexDocuments();
/**
 * Constant exposed to subclasses; it is not referenced anywhere within this class.
 * NOTE(review): presumably the Lucene field name under which subclasses store a unique document id - confirm against the subclasses.
 */
protected final static String UNIQUE_LUCENE_DOCUMENT_ID = "uldid";
/**
 * Collects the regular files that are located exactly one level below the given directory, i.e. the
 * files inside its direct sub directories. Files directly inside {@code directory} and deeper nested
 * files are not included.
 *
 * @param directory the directory whose sub directories are scanned
 * @return the files found in the direct sub directories
 * @throws RuntimeException if the directory or one of its sub directories cannot be listed
 */
protected List<File> getAllFilesInSubDirectories(File directory) {
    final File[] subDirectories = directory.listFiles();
    if (subDirectories == null) {
        throw new RuntimeException("Could not list files in " + directory.getAbsolutePath());
    }

    final List<File> files = new ArrayList<>();
    for (File dir : subDirectories) {
        if (!dir.isDirectory()) {
            continue;
        }

        final File[] noteFiles = dir.listFiles();
        if (noteFiles == null) {
            throw new RuntimeException("Could not list files in " + dir.getAbsolutePath());
        }

        for (File noteFile : noteFiles) {
            if (noteFile.isFile()) {
                files.add(noteFile);
            }
        }
    }

    return files;
}
/**
 * Lists the files below the given directory by delegating to {@code FileUtils.listFiles} with no
 * extension filter and the recursive flag set.
 *
 * @param directory the directory to scan
 * @return the files found
 */
protected Collection<File> getAllFilesRecursively(File directory) {
    return FileUtils.listFiles(directory, null, true);
}
/**
 * Parses the given string (encoded as UTF-8) into a DOM document.
 * <p>
 * The parser is configured to not resolve external entities or external DTDs, since the parsed content
 * originates from downloaded data.
 *
 * @param string the XML content, must not be empty
 * @return the parsed document
 * @throws IllegalArgumentException     if the string is empty
 * @throws ParserConfigurationException if the parser cannot be created or does not support the requested features
 * @throws IOException                  if reading the content fails
 * @throws SAXException                 if the content is not well-formed XML
 */
protected org.w3c.dom.Document parseXmlDocument(String string) throws ParserConfigurationException, IOException, SAXException {
    if (StringUtils.isEmpty(string)) {
        throw new IllegalArgumentException("String cannot be empty to create XML document");
    }

    final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    // XXE hardening: never fetch or expand anything that lives outside of the passed document
    factory.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
    factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
    factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
    factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    factory.setIgnoringElementContentWhitespace(true);

    final DocumentBuilder builder = factory.newDocumentBuilder();
    return builder.parse(new ByteArrayInputStream(string.getBytes(StandardCharsets.UTF_8)));
}
public static String getDirectoryIdentifier(Class extends Index> index) {
if (index.isAnnotationPresent(MirrorMetadata.class)) {
return index.getAnnotation(MirrorMetadata.class).directoryName();
}
LOG.warn("Index {} does not have a directory name annotation", index.getSimpleName());
// TODO: remove these, covered by Annotation above
if (CertFrAdvisorIndex.class.equals(index)) {
return "certfr-advisors";
} else if (CertSeiAdvisorIndex.class.equals(index)) {
return "certsei-advisors";
} else if (CpeDictionaryVendorProductIndex.class.equals(index)) {
return "cpe-dict-vp-legacy-feed";
} else if (NvdCpeApiIndex.class.equals(index) || CpeDictionaryIndex.class.equals(index)) {
return "cpe-dict";
} else if (MsrcAdvisorIndex.class.equals(index)) {
return "msrc-advisors";
} else if (MsrcProductIndex.class.equals(index)) {
return "msrc-products";
} else if (MsrcKbChainIndex.class.equals(index)) {
return "msrc-kb-chains";
} else if (NvdVulnerabilityIndex.class.equals(index)) {
return "nvd-cve-legacy-feed";
} else if (NvdCveApiIndex.class.equals(index)) {
return "nvd-cve";
}
throw new RuntimeException("Unknown index class: " + index);
}
public static Index getInstance(Class extends Index> clazz, File baseMirrorDirectory) {
try {
return clazz.getConstructor(File.class)
.newInstance(baseMirrorDirectory);
} catch (Exception e) {
throw new RuntimeException("Unable to create index class", e);
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy