All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.metaeffekt.mirror.index.Index Maven / Gradle / Ivy

There is a newer version: 0.134.0
Show newest version
/*
 * Copyright 2021-2024 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.metaeffekt.mirror.index;

import com.metaeffekt.artifact.analysis.utils.BuildProperties;
import com.metaeffekt.artifact.analysis.utils.FileUtils;
import com.metaeffekt.artifact.analysis.utils.StringUtils;
import com.metaeffekt.artifact.analysis.utils.TimeUtils;
import com.metaeffekt.mirror.Mirror;
import com.metaeffekt.mirror.download.documentation.MirrorMetadata;
import com.metaeffekt.mirror.concurrency.ScheduledDelayedThreadPoolExecutor;
import com.metaeffekt.mirror.download.Download;
import com.metaeffekt.mirror.index.advisor.*;
import com.metaeffekt.mirror.index.nvd.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
 * Base class for mirror indexes: builds and queries a Lucene index over the data specified by its subclasses.
 */
public abstract class Index extends Mirror {

    /** Logger used by the index base class. */
    private final static Logger LOG = LoggerFactory.getLogger(Index.class);

    /** Directory the Lucene index is stored in: {@code <baseMirrorDirectory>/index/<mirrorIdentifier>}. */
    protected final File indexIntoDirectory;

    /**
     * Timeout handed to {@code waitForFileUnlockIfLocked} before the index is updated.
     * NOTE(review): the default expression suggests milliseconds; confirm against {@code Mirror}.
     */
    protected int lockFileTimeout = 10 * 60 * 1000; // 10 minutes

    /** Executor available to subclasses; it is not referenced by any other code in this class. */
    protected final ScheduledDelayedThreadPoolExecutor executor = new ScheduledDelayedThreadPoolExecutor(16, 0);

    /** Lucene directory opened on {@link #indexIntoDirectory}. */
    protected final Directory luceneDirectory;
    /** Lazily opened reader; discarded by {@code resetCachedReaderSearcher()} after documents are written. */
    private IndexReader cachedIndexReader;
    /** Lazily created searcher on top of {@link #cachedIndexReader}. */
    private org.apache.lucene.search.IndexSearcher cachedIndexSearcher;

    /** Download directories ({@code <baseMirrorDirectory>/download/<identifier>}) this index requires. */
    protected final File[] requiredDownloads;
    /** Index instances this index requires. */
    protected final Index[] requiredIndexes;

    /** Download directories this index can optionally use. */
    protected final File[] optionalDownloads;
    /** Index instances this index can optionally use. */
    protected final Index[] optionalIndexes;

    /**
     * Creates an index that has only required dependencies; the optional downloads and indexes are empty.
     *
     * @param baseMirrorDirectory the base directory of the mirror
     * @param indexClass          the concrete index class, used to derive the directory identifier
     * @param requiredDownloads   download classes whose directories this index requires
     * @param requiredIndexes     index classes this index requires
     */
    public Index(File baseMirrorDirectory, Class<? extends Index> indexClass, List<Class<? extends Download>> requiredDownloads, List<Class<? extends Index>> requiredIndexes) {
        this(baseMirrorDirectory, indexClass, requiredDownloads, requiredIndexes, Collections.emptyList(), Collections.emptyList());
    }

    /**
     * Creates an index, opens its Lucene directory and resolves the required and optional
     * download directories and index instances.
     * <p>
     * Missing downloads or indexes are only logged on debug level, as they are not needed for read access.
     *
     * @param baseMirrorDirectory the base directory of the mirror
     * @param indexClass          the concrete index class, used to derive the directory identifier
     * @param requiredDownloads   download classes whose directories this index requires
     * @param requiredIndexes     index classes this index requires
     * @param optionalDownloads   download classes whose directories this index can optionally use
     * @param optionalIndexes     index classes this index can optionally use
     * @throws RuntimeException if the Lucene directory cannot be opened
     */
    public Index(File baseMirrorDirectory, Class<? extends Index> indexClass, List<Class<? extends Download>> requiredDownloads, List<Class<? extends Index>> requiredIndexes, List<Class<? extends Download>> optionalDownloads, List<Class<? extends Index>> optionalIndexes) {
        super(baseMirrorDirectory, Index.getDirectoryIdentifier(indexClass));
        this.indexIntoDirectory = new File(new File(super.baseMirrorDirectory, "index"), super.mirrorIdentifier);

        if (!this.indexIntoDirectory.exists()) {
            this.indexIntoDirectory.mkdirs();
        }
        try {
            luceneDirectory = FSDirectory.open(indexIntoDirectory.toPath());
        } catch (IOException e) {
            throw new RuntimeException("Unable to open lucene index directory inside " + baseMirrorDirectory.getAbsolutePath(), e);
        }

        this.requiredDownloads = new File[requiredDownloads.size()];
        for (int i = 0; i < requiredDownloads.size(); i++) {
            this.requiredDownloads[i] = new File(new File(super.baseMirrorDirectory, "download"), Download.getDirectoryIdentifier(requiredDownloads.get(i)));

            if (!this.requiredDownloads[i].exists()) {
                // log the directory itself; the class name of a File is always "File"
                LOG.debug("Required download does not exist in {} for index [{}] (this can be ignored if only read access is required) in {}", this.requiredDownloads[i].getAbsolutePath(), mirrorIdentifier, getClass().getName());
            }
        }

        this.requiredIndexes = new Index[requiredIndexes.size()];
        for (int i = 0; i < requiredIndexes.size(); i++) {
            this.requiredIndexes[i] = Index.getInstance(requiredIndexes.get(i), baseMirrorDirectory);

            try {
                this.requiredIndexes[i].assertExists();
            } catch (Exception e) {
                LOG.debug("Required index does not exist in {} for index [{}] (this can be ignored if only read access is required): {} in {}", this.requiredIndexes[i].getClass().getSimpleName(), mirrorIdentifier, e.getMessage(), getClass().getSimpleName());
            }
        }

        this.optionalDownloads = new File[optionalDownloads.size()];
        for (int i = 0; i < optionalDownloads.size(); i++) {
            this.optionalDownloads[i] = new File(new File(super.baseMirrorDirectory, "download"), Download.getDirectoryIdentifier(optionalDownloads.get(i)));

            if (!this.optionalDownloads[i].exists()) {
                // log the directory itself; the class name of a File is always "File"
                LOG.debug("Optional download does not exist in {} for index [{}] (this can be ignored if index is not required) in {}", this.optionalDownloads[i].getAbsolutePath(), mirrorIdentifier, getClass().getName());
            }
        }

        this.optionalIndexes = new Index[optionalIndexes.size()];
        for (int i = 0; i < optionalIndexes.size(); i++) {
            this.optionalIndexes[i] = Index.getInstance(optionalIndexes.get(i), baseMirrorDirectory);

            try {
                this.optionalIndexes[i].assertExists();
            } catch (Exception e) {
                LOG.debug("Optional index does not exist in {} for index [{}] (this can be ignored if index is not required): {} in {}", this.optionalIndexes[i].getClass().getSimpleName(), mirrorIdentifier, e.getMessage(), getClass().getSimpleName());
            }
        }
    }

    /**
     * Sets the timeout that {@code createIndexIfRequired()} passes to {@code waitForFileUnlockIfLocked}.
     *
     * @param lockFileTimeout the new timeout; NOTE(review): presumably milliseconds, as the default
     *                        {@code 10 * 60 * 1000} is annotated as 10 minutes; confirm against {@code Mirror}
     */
    public void setLockFileTimeout(int lockFileTimeout) {
        this.lockFileTimeout = lockFileTimeout;
    }

    /**
     * Looks up the first required index that is an instance of the given class.
     *
     * @param indexClass the index type to look for
     * @param <T>        the index type
     * @return the matching required index, or {@code null} if none of the required indexes is of that type
     */
    public <T extends Index> T getRequiredIndex(Class<T> indexClass) {
        for (Index index : this.requiredIndexes) {
            if (indexClass.isInstance(index)) {
                return indexClass.cast(index);
            }
        }
        return null;
    }

    /**
     * @return the directory this index is stored in
     */
    public File getIndexIntoDirectory() {
        return indexIntoDirectory;
    }

    /**
     * Verifies that the index directory exists, is a directory and contains at least two entries.
     *
     * @throws RuntimeException if the directory is missing, is not a directory, cannot be listed
     *                          or contains fewer than two entries
     */
    public void assertExists() {
        if (!this.indexIntoDirectory.exists()) {
            throw new RuntimeException("Index directory does not exist: " + this.indexIntoDirectory);
        }
        if (!this.indexIntoDirectory.isDirectory()) {
            throw new RuntimeException("Index directory is not a directory: " + this.indexIntoDirectory);
        }

        // listFiles() returns null on an I/O error, even for an existing directory
        final File[] indexFiles = this.indexIntoDirectory.listFiles();
        if (indexFiles == null) {
            throw new RuntimeException("Index directory could not be listed: " + this.indexIntoDirectory);
        }
        // NOTE(review): a directory with a single entry is treated as empty; presumably that entry is a
        // metadata file rather than index content. Confirm.
        if (indexFiles.length < 2) {
            throw new RuntimeException("Index directory is empty: " + this.indexIntoDirectory);
        }
    }

    /**
     * Rebuilds the index if {@link #isIndexingRequired()} reports that it is out of date.
     * <p>
     * The index directory is locked for the duration of the update. Before the index is cleared a backup
     * is created; if anything fails, the failed flag is written to the info file, the backup is restored
     * and the failure is rethrown. The backup is removed and the lock released in every case.
     *
     * @throws RuntimeException if the index could not be updated; the original exception is attached as cause
     */
    public void createIndexIfRequired() {
        boolean outcome = false;
        try {
            super.logTitle("");
            super.waitForFileUnlockIfLocked(indexIntoDirectory, lockFileTimeout);
            super.lockFile(indexIntoDirectory);

            if (isIndexingRequired()) {
                createIndexBackup();
                clearIndex();
                createIndex();
                setLastUpdatedToNow();
            } else {
                LOG.info("Index is already up to date: {}", indexIntoDirectory);
            }

            super.propertyFiles.set(indexIntoDirectory, "info", InfoFileAttributes.INDEX_FAILED_FLAG.getKey(), "false");
            outcome = true;

        } catch (Exception e) {
            try {
                super.propertyFiles.set(indexIntoDirectory, "info", InfoFileAttributes.INDEX_FAILED_FLAG.getKey(), "true");
            } catch (Exception ignored) {
                // best effort: failing to record the flag must not hide the original failure
            }
            loadIndexBackup();
            throw new RuntimeException("Unable to update index in " + indexIntoDirectory + "\n" + e.getMessage(), e);
        } finally {
            try {
                removeIndexBackup();
            } finally {
                // release the lock even if removing the backup fails
                super.unlockFile(indexIntoDirectory);
                if (outcome) {
                    super.logTitle("Done: ");
                } else {
                    super.logTitle("FAILED: ");
                }
            }
        }
    }

    /**
     * Creates the index documents via {@link #createIndexDocuments()} and writes them to the index.
     * Nothing is written if no documents were created.
     *
     * @throws RuntimeException if required downloads are missing or have failed
     */
    protected void createIndex() {
        assertRequiredDownloadsExist();

        LOG.info("Creating index documents for index in: {}", indexIntoDirectory);
        final Map<String, Document> indexDocuments = createIndexDocuments();
        if (!indexDocuments.isEmpty()) {
            writeIndexDocuments(indexDocuments);
        }
    }

    /**
     * Copies the index directory into a sibling directory named {@code <index directory name>-backup}.
     * An existing backup directory is emptied first. I/O failures are logged and not propagated.
     */
    private void createIndexBackup() {
        try {
            final String backupName = indexIntoDirectory.getName() + "-backup";
            final File backup = new File(indexIntoDirectory.getParentFile(), backupName);

            if (backup.exists()) {
                FileUtils.cleanDirectory(backup);
            }

            if (indexIntoDirectory.exists()) {
                FileUtils.copyDirectory(indexIntoDirectory, backup);
                LOG.info("Created index backup in: {}", backup);
            } else {
                LOG.warn("Index directory does not exist, unable to create backup: {}", indexIntoDirectory);
            }
        } catch (IOException ioException) {
            LOG.error("Unable to create index backup for " + indexIntoDirectory.getAbsolutePath(), ioException);
        }
    }

    /**
     * Restores the index directory from the sibling {@code <index directory name>-backup} directory.
     * An existing index directory is emptied before the backup is copied back. A missing backup or an
     * I/O failure is logged and not propagated.
     */
    private void loadIndexBackup() {
        try {
            final String backupName = indexIntoDirectory.getName() + "-backup";
            final File backup = new File(indexIntoDirectory.getParentFile(), backupName);

            if (backup.exists()) {
                if (indexIntoDirectory.exists()) {
                    FileUtils.cleanDirectory(indexIntoDirectory);
                }
                FileUtils.copyDirectory(backup, indexIntoDirectory);
                LOG.info("Loaded index backup from: {}", backup);
            } else {
                LOG.error("Unable to load index backup for " + indexIntoDirectory.getAbsolutePath() + " as it does not exist");
            }
        } catch (IOException ioException) {
            LOG.error("Unable to load index backup for " + indexIntoDirectory.getAbsolutePath(), ioException);
        }
    }

    /**
     * Deletes the sibling {@code <index directory name>-backup} directory if it exists.
     */
    private void removeIndexBackup() {
        final String backupName = indexIntoDirectory.getName() + "-backup";
        final File backup = new File(indexIntoDirectory.getParentFile(), backupName);
        if (!backup.exists()) {
            return;
        }
        FileUtils.deleteDir(backup);
    }

    /**
     * Writes the given documents to the Lucene index and then empties the passed map.
     * <p>
     * Entries with a {@code null} key are skipped with a warning. The cached reader and searcher are
     * discarded afterwards so that subsequent reads see the new documents.
     *
     * @param indexDocuments the documents to write, keyed by their identifier; cleared on success
     * @throws RuntimeException if writing to the index fails
     */
    protected void writeIndexDocuments(Map<String, Document> indexDocuments) {
        LOG.info("Indexing [{}] document{}", indexDocuments.size(), indexDocuments.size() == 1 ? "" : "s");

        try {
            final IndexWriterConfig luceneWriterConfig = new IndexWriterConfig(Analyzers.getStandardAnalyzer());
            luceneWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

            // try-with-resources: the writer (and with it the index write lock) is released even if a write fails
            try (IndexWriter luceneDocumentWriter = new IndexWriter(luceneDirectory, luceneWriterConfig)) {
                for (Map.Entry<String, Document> indexDocument : indexDocuments.entrySet()) {
                    if (indexDocument.getKey() == null) {
                        LOG.warn("Index document has no key: {}", indexDocument.getValue());
                        continue;
                    }

                    // https://stackoverflow.com/questions/73484766/lucene-index-query-does-not-find-document-if-too-many-documents-similar-document
                    // NOTE(review): the term is built from the key's hash code, so two distinct keys with the
                    // same hash code share one term.
                    final Term term = new Term("doc_id", String.valueOf(indexDocument.getKey().hashCode()));

                    luceneDocumentWriter.updateDocument(term, indexDocument.getValue());
                }
            }

            resetCachedReaderSearcher();
        } catch (IOException e) {
            throw new RuntimeException("Unable to write index documents to " + indexIntoDirectory, e);
        }

        indexDocuments.clear();
    }

    /**
     * Closes the cached index reader, if any, and drops both the cached reader and the cached searcher.
     * A failure to close the reader is ignored.
     */
    private void resetCachedReaderSearcher() {
        final IndexReader openReader = cachedIndexReader;
        if (openReader != null) {
            try {
                openReader.close();
            } catch (IOException ignored) {
                // the reader is discarded below either way
            }
        }

        cachedIndexSearcher = null;
        cachedIndexReader = null;
    }

    /**
     * Returns the cached index reader, opening it on the Lucene directory on first use.
     *
     * @return the index reader, never {@code null}
     * @throws IOException if the index cannot be opened, for example because it does not exist yet
     */
    private IndexReader getIndexReader() throws IOException {
        if (cachedIndexReader == null) {
            // let the failure propagate: callers wrap IOException with context, whereas a swallowed
            // exception would only surface later as a NullPointerException
            cachedIndexReader = DirectoryReader.open(luceneDirectory);
        }
        return cachedIndexReader;
    }

    /**
     * Returns the cached index searcher, creating it on top of the index reader on first use.
     *
     * @return the index searcher
     * @throws IOException declared by the reader lookup
     */
    private org.apache.lucene.search.IndexSearcher getIndexSearcher() throws IOException {
        if (cachedIndexSearcher != null) {
            return cachedIndexSearcher;
        }
        final IndexReader reader = getIndexReader();
        cachedIndexSearcher = new org.apache.lucene.search.IndexSearcher(reader);
        return cachedIndexSearcher;
    }

    /**
     * Loads every live document of the index into memory.
     *
     * @return all documents that are not deleted, in document id order
     * @throws RuntimeException if the index cannot be read
     */
    public List<Document> findAllDocuments() {
        try {
            final IndexReader indexReader = getIndexReader();
            // document ids range up to maxDoc(); numDocs() excludes deleted documents, so it is only a
            // size hint and cannot be used as the upper bound of the id range
            final int maxDoc = indexReader.maxDoc();
            final org.apache.lucene.util.Bits liveDocs = MultiBits.getLiveDocs(indexReader); // null if nothing is deleted
            final List<Document> documents = new ArrayList<>(indexReader.numDocs());
            for (int docId = 0; docId < maxDoc; docId++) {
                if (liveDocs != null && !liveDocs.get(docId)) {
                    continue;
                }
                documents.add(indexReader.document(docId));
            }
            return documents;
        } catch (IOException e) {
            throw new RuntimeException("Unable to find all documents in index", e);
        }
    }

    /**
     * Counts the documents in the index.
     *
     * @return the number of documents as reported by the index reader's {@code numDocs()}
     * @throws RuntimeException if the index cannot be read
     */
    public int documentCount() {
        try {
            return getIndexReader().numDocs();
        } catch (IOException e) {
            throw new RuntimeException("Unable to count documents in index", e);
        }
    }

    /**
     * Passes every live document of the index to the given consumer, one at a time.
     *
     * @param consumer receives each document that is not deleted, in document id order
     * @throws RuntimeException if the index cannot be read
     */
    public void findAndProcessAllDocuments(Consumer<Document> consumer) {
        try {
            final IndexReader indexReader = getIndexReader();
            // document ids range up to maxDoc(); numDocs() excludes deleted documents
            final int maxDoc = indexReader.maxDoc();
            final org.apache.lucene.util.Bits liveDocs = MultiBits.getLiveDocs(indexReader); // null if nothing is deleted
            for (int docId = 0; docId < maxDoc; docId++) {
                if (liveDocs != null && !liveDocs.get(docId)) {
                    continue;
                }
                consumer.accept(indexReader.document(docId));
            }
        } catch (IOException e) {
            throw new RuntimeException("Unable to find all documents in index", e);
        }
    }

    /**
     * Passes the live documents of the index to the given function until it asks to stop.
     *
     * @param consumer receives each document that is not deleted, in document id order; returning
     *                 {@code false} stops the iteration
     * @throws RuntimeException if the index cannot be read
     */
    public void findAndProcessAllDocumentsCancelable(Function<Document, Boolean> consumer) {
        try {
            final IndexReader indexReader = getIndexReader();
            // document ids range up to maxDoc(); numDocs() excludes deleted documents
            final int maxDoc = indexReader.maxDoc();
            final org.apache.lucene.util.Bits liveDocs = MultiBits.getLiveDocs(indexReader); // null if nothing is deleted
            for (int docId = 0; docId < maxDoc; docId++) {
                if (liveDocs != null && !liveDocs.get(docId)) {
                    continue;
                }
                final boolean continueSearch = consumer.apply(indexReader.document(docId));
                if (!continueSearch) {
                    break;
                }
            }
        } catch (IOException e) {
            throw new RuntimeException("Unable to find all documents in index", e);
        }
    }

    /**
     * Passes every live document of the index, together with its document id, to the given consumer.
     *
     * @param consumer receives each document that is not deleted and its document id, in document id order
     * @throws RuntimeException if the index cannot be read
     */
    public void findAndProcessAllDocuments(BiConsumer<Document, Integer> consumer) {
        try {
            final IndexReader indexReader = getIndexReader();
            // document ids range up to maxDoc(); numDocs() excludes deleted documents
            final int maxDoc = indexReader.maxDoc();
            final org.apache.lucene.util.Bits liveDocs = MultiBits.getLiveDocs(indexReader); // null if nothing is deleted
            for (int docId = 0; docId < maxDoc; docId++) {
                if (liveDocs != null && !liveDocs.get(docId)) {
                    continue;
                }
                consumer.accept(indexReader.document(docId), docId);
            }
        } catch (IOException e) {
            throw new RuntimeException("Unable to find all documents in index", e);
        }
    }

    /**
     * Runs the given search against this index.
     *
     * @param indexSearch the search to perform
     * @return the documents returned by the search
     * @throws RuntimeException if the index cannot be read, the query is malformed or the search fails
     *                          for any other reason; the original exception is attached as cause
     */
    public List<Document> findDocuments(IndexSearch indexSearch) {
        try {
            return indexSearch.search(getIndexSearcher());
        } catch (IOException e) {
            throw new RuntimeException("Unable to search index for [" + indexSearch + "]", e);
        } catch (ParseException e) {
            throw new RuntimeException("Malformed search query for [" + indexSearch + "]", e);
        } catch (Exception e) {
            throw new RuntimeException("Unknown exception whilst performing query [" + indexSearch + "] on index in " + this.indexIntoDirectory.getAbsolutePath() + " ; make sure that the index exists and is valid", e);
        }
    }

    /**
     * Runs the given query against this index and loads all matching documents.
     *
     * @param query the Lucene query to run
     * @return the matching documents, in the order of the search result
     * @throws RuntimeException if the index cannot be read or the search fails for any other reason;
     *                          the original exception is attached as cause
     */
    public List<Document> findDocuments(Query query) {
        try {
            final org.apache.lucene.search.IndexSearcher indexSearcher = getIndexSearcher();
            final TopDocs topDocs = indexSearcher.search(query, Integer.MAX_VALUE);

            // iterate over the hits that were actually collected; totalHits.value is a long count and is
            // not the length of the scoreDocs array
            final int hitCount = topDocs.scoreDocs.length;
            final List<Document> documents = new ArrayList<>(hitCount);
            for (int i = 0; i < hitCount; i++) {
                documents.add(indexSearcher.doc(topDocs.scoreDocs[i].doc));
            }

            return documents;
        } catch (IOException e) {
            throw new RuntimeException("Unable to search index for [" + query + "]", e);
        } catch (Exception e) {
            throw new RuntimeException("Unknown exception whilst performing query [" + query + "] on index in " + this.indexIntoDirectory.getAbsolutePath() + " ; make sure that the index exists and is valid", e);
        }
    }

    /**
     * Decides whether the index has to be rebuilt and records the time of this check.
     * <p>
     * Indexing is required if the index has no last-updated timestamp, if a required download has no
     * last-updated timestamp, is more recent than the index or has its failed flag set, if the last
     * indexing attempt failed, or if a required index is more recent than this index.
     *
     * @return {@code true} if the index has to be rebuilt
     */
    protected boolean isIndexingRequired() {
        this.setLastCheckedToNow();

        final long directoryLastModified = getDirectoryLastModified();

        if (directoryLastModified == 0L) {
            LOG.info("Index directory is empty, indexing is required");
            return true;
        }

        for (File requiredDownload : this.requiredDownloads) {
            final long downloadLastModified = propertyFiles.getLong(requiredDownload, "info", InfoFileAttributes.LAST_UPDATED.getKey())
                    .orElse(0L);

            if (downloadLastModified == 0) {
                LOG.info("Required download is empty, attempting to index");
                return true;
            }

            if (downloadLastModified > directoryLastModified) {
                LOG.info("Index is out of date, download [{}] is more recent [{}] --> [{}]", requiredDownload.getName(),
                        TimeUtils.formatNormalizedDate(new Date(downloadLastModified)),
                        TimeUtils.formatNormalizedDate(new Date(directoryLastModified)));
                return true;
            }

            final boolean hasLastDownloadFailed = propertyFiles.getBoolean(requiredDownload, "info", InfoFileAttributes.DOWNLOAD_FAILED_FLAG.getKey())
                    .orElse(false);
            if (hasLastDownloadFailed) {
                LOG.info("Last download failed, attempting to index");
                return true;
            }
        }

        // independent of the downloads, so it must also be checked when there are no required downloads
        if (hasLastIndexFailed()) {
            LOG.info("Last index failed, attempting to index");
            return true;
        }

        for (Index requiredIndex : this.requiredIndexes) {
            final long indexLastModified = propertyFiles.getLong(requiredIndex.getIndexIntoDirectory(), "info", InfoFileAttributes.LAST_UPDATED.getKey())
                    .orElse(0L);

            // a required index without a timestamp cannot be compared and is skipped
            if (indexLastModified == 0) {
                continue;
            }

            if (indexLastModified > directoryLastModified) {
                LOG.info("Index is out of date, index [{}] is more recent [{}] --> [{}]", requiredIndex.getIndexIntoDirectory().getName(),
                        TimeUtils.formatNormalizedDate(new Date(indexLastModified)),
                        TimeUtils.formatNormalizedDate(new Date(directoryLastModified)));
                return true;
            }
        }

        return false;
    }

    /**
     * Reads the index-failed flag from the "info" property file of the index directory.
     *
     * @return the stored flag, or {@code false} if no value is present
     */
    public boolean hasLastIndexFailed() {
        return propertyFiles.getBoolean(this.indexIntoDirectory, "info", InfoFileAttributes.INDEX_FAILED_FLAG.getKey())
                .orElse(false);
    }

    /**
     * Reads the last-updated timestamp from the "info" property file of the index directory.
     *
     * @return the stored timestamp, or {@code 0} if no value is present
     */
    public long getDirectoryLastModified() {
        return propertyFiles.getLong(this.indexIntoDirectory, "info", InfoFileAttributes.LAST_UPDATED.getKey())
                .orElse(0L);
    }

    /**
     * Stores the current time as last-updated timestamp (raw and formatted) together with the project
     * version in the "info" property file of the index directory.
     */
    private void setLastUpdatedToNow() {
        final long timestamp = TimeUtils.utcNow();
        final String infoFile = "info";

        propertyFiles.set(indexIntoDirectory, infoFile, InfoFileAttributes.LAST_UPDATED.getKey(), timestamp);
        propertyFiles.set(indexIntoDirectory, infoFile, InfoFileAttributes.LAST_UPDATED_FORMATTED.getKey(), new Date(timestamp));
        propertyFiles.set(indexIntoDirectory, infoFile, InfoFileAttributes.MIRROR_VERSION.getKey(), BuildProperties.getProjectVersion());

        LOG.info("Set last updated to [{}] in {}", new Date(timestamp), indexIntoDirectory);
    }

    /**
     * Stores the current time as last-checked timestamp (raw and formatted) in the "info" property
     * file of the index directory.
     */
    private void setLastCheckedToNow() {
        final long now = TimeUtils.utcNow();
        super.propertyFiles.set(indexIntoDirectory, "info", InfoFileAttributes.LAST_CHECKED.getKey(), now);
        super.propertyFiles.set(indexIntoDirectory, "info", InfoFileAttributes.LAST_CHECKED_FORMATTED.getKey(), new Date(now));

        // this method records the check time, not the update time
        LOG.info("Set last checked to [{}] in {}", new Date(now), indexIntoDirectory);
    }

    /**
     * Verifies that every required download directory exists and does not have its failed flag set.
     *
     * @throws RuntimeException listing all missing and all failed downloads, if there are any
     */
    private void assertRequiredDownloadsExist() {
        final List<File> missingDownloads = new ArrayList<>();
        final List<File> failedDownloads = new ArrayList<>();

        for (File requiredDownload : requiredDownloads) {
            if (!requiredDownload.exists()) {
                missingDownloads.add(requiredDownload);
            }

            final boolean hasLastDownloadFailed = propertyFiles.getBoolean(requiredDownload, "info", InfoFileAttributes.DOWNLOAD_FAILED_FLAG.getKey())
                    .orElse(false);
            if (hasLastDownloadFailed) {
                failedDownloads.add(requiredDownload);
            }
        }

        // collect both problem categories into one message so that a single run reports everything
        final StringBuilder message = new StringBuilder();
        if (!missingDownloads.isEmpty()) {
            message.append("Cannot create index [").append(this.getClass().getSimpleName()).append("], required downloads are missing: ");
            message.append(missingDownloads.stream().map(File::getName).collect(Collectors.joining(", ")));
        }
        if (!failedDownloads.isEmpty()) {
            if (message.length() > 0) {
                message.append(System.lineSeparator());
            }
            message.append("Cannot create index, required downloads have failed (re-download the data to fix corrupted download): ");
            message.append(failedDownloads.stream().map(File::getName).collect(Collectors.joining(", ")));
        }

        if (message.length() > 0) {
            throw new RuntimeException(message.toString());
        }
    }

    /**
     * Flushes the cached property files and deletes the index directory.
     * A failure is logged as a warning and rethrown.
     *
     * @return this index
     */
    public Index clearIndex() {
        final File target = this.indexIntoDirectory;
        try {
            propertyFiles.flushCachedAePropertyFiles();
            FileUtils.deleteDir(target);
            LOG.info("Cleared index in {}", target.getAbsolutePath());
            return this;
        } catch (Exception failure) {
            LOG.warn("Unable to clear index in " + target.getAbsolutePath());
            throw failure;
        }
    }

    /**
     * Creates {@link org.apache.lucene.document.Document}s that will be included in the index.
* To be able to identify documents, a unique identifier must be provided as key. * * @return A map of String identifiers with a {@link org.apache.lucene.document.Document} each. */ protected abstract Map createIndexDocuments(); protected final static String UNIQUE_LUCENE_DOCUMENT_ID = "uldid"; protected List getAllFilesInSubDirectories(File directory) { final List files = new ArrayList<>(); final File[] subDirectories = directory.listFiles(); if (subDirectories == null) { throw new RuntimeException("Could not list files in " + directory.getAbsolutePath()); } for (File dir : subDirectories) { if (!dir.isDirectory()) continue; final File[] noteFiles = dir.listFiles(); if (noteFiles == null) { throw new RuntimeException("Could not list files in " + dir.getAbsolutePath()); } for (File noteFile : noteFiles) { if (noteFile.isFile()) { files.add(noteFile); } } } return files; } protected Collection getAllFilesRecursively(File directory) { return FileUtils.listFiles(directory, null, true); } protected org.w3c.dom.Document parseXmlDocument(String string) throws ParserConfigurationException, IOException, SAXException { if (StringUtils.isEmpty(string)) throw new IllegalArgumentException("String cannot be empty to create XML document"); DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setIgnoringElementContentWhitespace(true); DocumentBuilder builder = factory.newDocumentBuilder(); return builder.parse(new ByteArrayInputStream(string.getBytes(StandardCharsets.UTF_8))); } public static String getDirectoryIdentifier(Class index) { if (index.isAnnotationPresent(MirrorMetadata.class)) { return index.getAnnotation(MirrorMetadata.class).directoryName(); } LOG.warn("Index {} does not have a directory name annotation", index.getSimpleName()); // TODO: remove these, covered by Annotation above if (CertFrAdvisorIndex.class.equals(index)) { return "certfr-advisors"; } else if (CertSeiAdvisorIndex.class.equals(index)) { return "certsei-advisors"; } else if 
(CpeDictionaryVendorProductIndex.class.equals(index)) { return "cpe-dict-vp-legacy-feed"; } else if (NvdCpeApiIndex.class.equals(index) || CpeDictionaryIndex.class.equals(index)) { return "cpe-dict"; } else if (MsrcAdvisorIndex.class.equals(index)) { return "msrc-advisors"; } else if (MsrcProductIndex.class.equals(index)) { return "msrc-products"; } else if (MsrcKbChainIndex.class.equals(index)) { return "msrc-kb-chains"; } else if (NvdVulnerabilityIndex.class.equals(index)) { return "nvd-cve-legacy-feed"; } else if (NvdCveApiIndex.class.equals(index)) { return "nvd-cve"; } throw new RuntimeException("Unknown index class: " + index); } public static Index getInstance(Class clazz, File baseMirrorDirectory) { try { return clazz.getConstructor(File.class) .newInstance(baseMirrorDirectory); } catch (Exception e) { throw new RuntimeException("Unable to create index class", e); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy