// com.metaeffekt.mirror.index.advisor.CertFrAdvisorIndex Maven / Gradle / Ivy
/*
* Copyright 2021-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.metaeffekt.mirror.index.advisor;
import com.metaeffekt.artifact.analysis.utils.FileUtils;
import com.metaeffekt.mirror.download.documentation.MirrorMetadata;
import com.metaeffekt.mirror.contents.advisory.CertFrAdvisorEntry;
import com.metaeffekt.mirror.download.advisor.CertFrDownload;
import com.metaeffekt.mirror.download.documentation.DocRelevantMethods;
import com.metaeffekt.mirror.index.Index;
import org.apache.lucene.document.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
* TXT
* The TXT files provided by the CERT-FR are mere transcriptions of PDF files and are highly unstructured, lacking proper
* segmentation, including header and footer on each page and PDF-specific formatting symbols. Each document
* contains a table in the header that provides some general information, such as the document ID and dates. The table rows
* appear in random order from document to document.
* After the header, there are multiple titles followed by text content. These headers are not normalized, with over 1800 unique
* headers, where sometimes there are variations of headers and some appear only once. After detecting the paragraphs, they
* are collected with their respective header and text content. Some titles are normalized to an English identifier for
* consistency.
*
* - Iterate over all TXT files in the directory
* - Read the contents of each file
* - Use CertFrAdvisorEntry.fromDownloadText to map the information to a CertFrAdvisorEntry object
*
*
* Mapping of TXT content to CertFrAdvisorEntry fields:
*
* | TXT | CertFrAdvisorEntry |
* |-----|--------------------|
* | table > first line that matches CERT-FR identifier format | id |
* | table > Date de la première version OR first line that matches date | createDate |
* | table > Date de la dernière version OR second line that matches date | updateDate |
* | Documentation > every second line contains a URL with the line above being the title | references |
* | CERT-FR and CVEs in the entire text content | referenceIds |
* | Summary, header > Object: | summary |
* | (if Documentation is absent: table > sources) & Description & all other further unstructured paragraphs that have not been used otherwise | description |
* | Risk | threat |
* | Recommendations & Solution | recommendation |
* | Temporary bypass | workarounds |
*
*
* Valid table headers are:
*
* - Systèmes affectés, Affected systems
* - Résumé, Summary
* - Risque\\(s\\), Risques, Risque, Risk
* - Solution, Solutions, Solution
* - Recommandations, Recommendations
* - Documentation, Documentations, Documentation
* - Contournement provisoire, Temporary bypass
* - Description, Description
* - Rappel des avis émis, Rappel des avis et des mises à jour émis, Rappel des avis et mises à jour émis, Reminder of notices issued
* - ^\\d{1,2} .+
*
* JSON
* Luckily, there is an alternative: Our downloader-preprocessing also builds JSON files from the API, which are much more structured.
* The JSON files contain the same information as the TXT files, but in a structured format.
* The fields used are the same as in the TXT files.
*/
@MirrorMetadata(directoryName = "certfr-advisors", mavenPropertyName = "certFrAdvisorIndex")
public class CertFrAdvisorIndex extends Index {

    private static final Logger LOG = LoggerFactory.getLogger(CertFrAdvisorIndex.class);

    /**
     * Creates the index, declaring {@link CertFrDownload} as its single required download.
     *
     * @param baseMirrorDirectory the mirror base directory, passed through to the {@link Index} constructor
     */
    public CertFrAdvisorIndex(File baseMirrorDirectory) {
        super(baseMirrorDirectory, CertFrAdvisorIndex.class, Collections.singletonList(CertFrDownload.class), Collections.emptyList());
    }

    /**
     * Parses every file found for the first required download into a {@link CertFrAdvisorEntry}
     * and converts it into an index document.
     * <p>
     * Files ending in {@code .txt} are read as UTF-8 lines and parsed with
     * {@code CertFrAdvisorEntry.fromDownloadText}; files ending in {@code .json} are parsed with
     * {@code CertFrAdvisorEntry.fromApiJson}. Any other file is skipped with a warning.
     * One task per file is submitted to the inherited executor, so the result map is
     * shared between tasks and must only be modified atomically.
     *
     * @return the created documents, keyed by the ID of the parsed entry; if several files yield
     * the same ID, only the first one registered is kept and the others are logged and skipped
     * @throws RuntimeException if the calling thread is interrupted while waiting for the tasks
     *                          to finish
     */
    @Override
    @DocRelevantMethods({"CertFrAdvisorEntry#fromDownloadText", "CertFrAdvisorEntry#fromApiJson"})
    protected Map<String, Document> createIndexDocuments() {
        final Map<String, Document> documents = new ConcurrentHashMap<>();
        final List<File> files = super.getAllFilesInSubDirectories(super.requiredDownloads[0]);

        for (File file : files) {
            super.executor.submit(() -> {
                try {
                    final CertFrAdvisorEntry parsedEntry;
                    if (file.getName().endsWith(".txt")) {
                        final List<String> contents = FileUtils.readLines(file, StandardCharsets.UTF_8);
                        parsedEntry = CertFrAdvisorEntry.fromDownloadText(contents);
                    } else if (file.getName().endsWith(".json")) {
                        parsedEntry = CertFrAdvisorEntry.fromApiJson(file);
                    } else {
                        LOG.warn("Unsupported file format, skipping: {}", file.getAbsolutePath());
                        return;
                    }

                    // putIfAbsent is a single atomic operation; a separate containsKey/put pair would
                    // let two tasks with the same ID both pass the check and silently overwrite each other.
                    final Document previous = documents.putIfAbsent(parsedEntry.getId(), parsedEntry.toDocument());
                    if (previous != null) {
                        LOG.warn("Duplicate entry found, skipping: {}", parsedEntry.toJson());
                    }
                } catch (IOException e) {
                    throw new RuntimeException("Unable to read file contents during indexing: " + file.getAbsolutePath(), e);
                } catch (Exception e) {
                    throw new RuntimeException("Unable to parse file content during indexing: " + file.getAbsolutePath(), e);
                }
            });
        }

        super.executor.setSize(16);
        super.executor.start();
        try {
            super.executor.join();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up the stack can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new RuntimeException("Failed to wait for indexing to complete.", e);
        }

        return documents;
    }
}