com.metaeffekt.mirror.download.advisor.CertEuDownload Maven / Gradle / Ivy
The newest version!
/*
* Copyright 2021-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.metaeffekt.mirror.download.advisor;
import com.metaeffekt.artifact.analysis.utils.FileUtils;
import com.metaeffekt.artifact.analysis.utils.StringUtils;
import com.metaeffekt.mirror.download.documentation.MirrorMetadata;
import com.metaeffekt.mirror.Retry;
import com.metaeffekt.mirror.download.Download;
import com.metaeffekt.mirror.download.ResourceLocation;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.digest.DigestUtils;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* References:
*
* - CERT-EU Security Advisories: https://cert.europa.eu/publications/security-advisories
*
* CERT-EU provides a list of security advisories that are published on their website.
* The advisories are published in a structured manner, with each advisory being contained in a separate JSON file.
* The downloader will fetch the list of advisories from the CERT-EU website and download the JSON files for each advisory.
* The advisories are structured in a yearly manner, with each year containing a list of advisories published in that year.
*/
@Slf4j
@MirrorMetadata(directoryName = "certeu", mavenPropertyName = "certEuDownload")
public class CertEuDownload extends Download {
// Explicitly declared SLF4J logger. The class is also annotated with @Slf4j, so this file uses both
// LOG and the Lombok-provided log field.
private final static Logger LOG = LoggerFactory.getLogger(CertEuDownload.class);
// Matches a whole line containing "publications/security-advisories/" followed by 4-5 digits and a
// double quote; group 1 captures the digits (the year).
private final static Pattern PATTERN_YEARLY_PUBLICATIONS_YEAR_EXTRACTION = Pattern.compile(".*publications/security-advisories/(\\d{4,5})\".*");
// Matches "/publications/security-advisories/<4-5 digits>-<3-5 digits>/"; group 1 captures the
// "<digits>-<digits>" part (the entry id, judging by the constant's name).
private final static Pattern PATTERN_YEARLY_PUBLICATIONS_ENTRY_ID_EXTRACTION = Pattern.compile("/publications/security-advisories/(\\d{4,5}-\\d{3,5})/");
// "publications-summary" sub-directory of the download directory.
private final File yearlyPublicationsDirectory = new File(super.getDownloadIntoDirectory(), "publications-summary");
// "publications" sub-directory of the download directory.
private final File publicationEntries = new File(super.getDownloadIntoDirectory(), "publications");
// "publication-dates.json" inside the publications-summary directory.
private final File previousPublicationDatesFile = new File(this.yearlyPublicationsDirectory, "publication-dates.json");
/**
 * Creates the CERT-EU downloader by passing the given base mirror directory and this class
 * to the {@link Download} super constructor.
 *
 * @param baseMirrorDirectory the base mirror directory handed to the super class
 */
public CertEuDownload(File baseMirrorDirectory) {
super(baseMirrorDirectory, CertEuDownload.class);
}
/**
 * Runs the CERT-EU download in two phases: first the yearly publication pages are obtained and,
 * per year, the entries that have to be fetched are determined; then every such entry is
 * downloaded and the year's entries are merged back into the publication dates file.
 * <p>
 * NOTE(review): the generic type arguments of the {@code Map} / {@code Map.Entry} declarations
 * below appear to be missing ({@code Map>}), presumably lost when this text was extracted.
 * They have to be restored before this method compiles; the value type of the per-entry map is
 * not derivable from this method alone.
 *
 * @throws RuntimeException if extracting the entries of a yearly page fails with an IOException
 */
@Override
protected void performDownload() {
// year -> lines of the yearly publications page
final Map> yearlyPublicationHtmlPages = getAllYearlyPublications();
// year -> entries that need to be fetched for that year; years without such entries are left out
final Map> entriesToBeFetchedPerYear = new LinkedHashMap<>();
for (Map.Entry> yearPage : yearlyPublicationHtmlPages.entrySet()) {
try {
final Map entriesToBeFetched = extractUpdatedEntryIdsForYearlyPublicationsHtml(yearPage.getKey(), yearPage.getValue());
if (!entriesToBeFetched.isEmpty()) {
entriesToBeFetchedPerYear.put(yearPage.getKey(), entriesToBeFetched);
} else {
log.info("Year [{}] is already complete and up to date, no need to fetch.", yearPage.getKey());
}
} catch (IOException e) {
// a failure for a single year aborts the whole download
throw new RuntimeException("Failed to extract the yearly publication entries for CERT-EU on year " + yearPage.getKey(), e);
}
}
// nothing to fetch for any year: return without downloading anything
if (entriesToBeFetchedPerYear.isEmpty()) {
log.info("No new entries to fetch, skipping download.");
return;
}
for (Map.Entry> yearEntries : entriesToBeFetchedPerYear.entrySet()) {
final Map entriesToBeFetched = yearEntries.getValue();
final int year = yearEntries.getKey();
log.info("Starting fetching process for year [{}] with [{}] entries to be fetched.", year, entriesToBeFetched.size());
// only the key (entry id) of each map entry is used for the download
for (Map.Entry entry : entriesToBeFetched.entrySet()) {
final String entryId = entry.getKey();
downloadEntry(year, entryId);
}
// the merge happens once per year, after all entries of that year were passed to downloadEntry
try {
this.mergePreviouslyParsedEntryPublicationDatesBackIntoFile(entriesToBeFetched);
} catch (IOException e) {
// a failed merge is only logged; processing continues with the next year
log.error("Failed to merge the fetched entries back into the publication dates file, this means that next time, all entries for year [{}] will be fetched again. This error should not occur, as it is a simple file read/write access.", year, e);
}
}
}
/**
 * Decides whether a download is required by comparing the SHA-256 hash stored in the "info"
 * property file with the hash of the current online RSS feed.
 * <p>
 * When the hashes differ, the online hash is written to the "info" property file before
 * returning.
 * NOTE(review): the new hash is persisted here rather than after a completed download;
 * confirm that a failing download cannot leave the stored hash ahead of the mirrored data.
 *
 * @return {@code true} if the stored hash differs from the online hash, {@code false} otherwise
 */
@Override
protected boolean additionalIsDownloadRequired() {
    final String storedHash = getLocalRssHash();
    final String currentHash = getOnlineRssHash();

    final boolean feedChanged = !storedHash.equals(currentHash);
    if (feedChanged) {
        log.info("RSS Feed changed since last execution, mirror is required");
        super.propertyFiles.set(
                super.getDownloadIntoDirectory(),
                "info",
                InfoFileAttributes.CERT_EU_PREFIX.getKey() + "rss-feed-sha256",
                currentHash);
    }
    return feedChanged;
}
/**
 * Fetches the RSS feed from the remote location and hashes it.
 * <p>
 * The fetched response lines are concatenated without any separator before the hash is computed.
 *
 * @return the hex-encoded SHA-256 hash of the concatenated feed lines
 */
private String getOnlineRssHash() {
    final URL rssFeedUrl = getRemoteResourceLocationUrl(ResourceLocationCertEu.RSS_FEED);
    return DigestUtils.sha256Hex(
            String.join("", super.downloader.fetchResponseBodyFromUrlAsList(rssFeedUrl)));
}
/**
 * Reads the RSS feed hash stored in the "info" property file of the download directory.
 *
 * @return the stored hash, or an empty string if no value is present
 */
private String getLocalRssHash() {
    final String hashPropertyKey = InfoFileAttributes.CERT_EU_PREFIX.getKey() + "rss-feed-sha256";
    return super.propertyFiles
            .getString(super.getDownloadIntoDirectory(), "info", hashPropertyKey)
            .orElse("");
}
private Map> getAllYearlyPublications() {
final Map> yearlyPublicationHtmlPages = new LinkedHashMap<>(); // year -> html lines
final List yearsToFetchPublicationsHtmlFor = new ArrayList<>();
try {
// start by fetching the oldest one from 2011, then extract the years to fetch from the elements
/*
2023
*/
log.info("Fetching CERT-EU publications for initial year 2011");
final List yearlyPublicationsPage2011 = getYearlyPublicationsPage(2011);
yearlyPublicationHtmlPages.put(2011, yearlyPublicationsPage2011);
for (String line : yearlyPublicationsPage2011) {
final Matcher yearMatcher = PATTERN_YEARLY_PUBLICATIONS_YEAR_EXTRACTION.matcher(line);
if (yearMatcher.matches()) {
final String year = yearMatcher.group(1);
if (!year.matches("\\d+")) {
// there is a line that does not contain a year as it redirects to the main page
log.warn("Failed to parse year from line: [{}]", line);
continue;
}
final int yearInt = Integer.parseInt(year);
yearsToFetchPublicationsHtmlFor.add(yearInt);
}
}
} catch (IOException e) {
LOG.error("Failed to fetch the oldest year from CERT-EU publications", e);
}
log.info("Fetching found CERT-EU publications for years {}", yearsToFetchPublicationsHtmlFor);
// fetch the rest of the years
for (int year : yearsToFetchPublicationsHtmlFor) {
try {
final List yearlyPublicationsPage = getYearlyPublicationsPage(year);
yearlyPublicationHtmlPages.put(year, yearlyPublicationsPage);
} catch (IOException e) {
LOG.error("Failed to fetch CERT-EU publications for year " + year, e);
}
}
return yearlyPublicationHtmlPages;
}
private Map extractUpdatedEntryIdsForYearlyPublicationsHtml(int year, List htmlLines) throws IOException {
/*
© 2015 - 2025 Weber Informatics LLC | Privacy Policy