All downloads are free. The search and download functionality uses the official Maven repository.

com.metaeffekt.mirror.download.advisor.CertEuDownload Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2021-2024 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.metaeffekt.mirror.download.advisor;

import com.metaeffekt.artifact.analysis.utils.FileUtils;
import com.metaeffekt.artifact.analysis.utils.StringUtils;
import com.metaeffekt.mirror.download.documentation.MirrorMetadata;
import com.metaeffekt.mirror.Retry;
import com.metaeffekt.mirror.download.Download;
import com.metaeffekt.mirror.download.ResourceLocation;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.digest.DigestUtils;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * 

References:

* *

 * CERT-EU provides a list of security advisories that are published on their website.
 * The advisories are published in a structured manner, with each advisory contained in a separate JSON file.
 * The downloader fetches the list of advisories from the CERT-EU website and downloads the JSON file for each advisory.
 * The advisories are organized by year, with each year containing the list of advisories published in that year.

*/ @Slf4j @MirrorMetadata(directoryName = "certeu", mavenPropertyName = "certEuDownload") public class CertEuDownload extends Download { private final static Logger LOG = LoggerFactory.getLogger(CertEuDownload.class); private final static Pattern PATTERN_YEARLY_PUBLICATIONS_YEAR_EXTRACTION = Pattern.compile(".*publications/security-advisories/(\\d{4,5})\".*"); private final static Pattern PATTERN_YEARLY_PUBLICATIONS_ENTRY_ID_EXTRACTION = Pattern.compile("/publications/security-advisories/(\\d{4,5}-\\d{3,5})/"); private final File yearlyPublicationsDirectory = new File(super.getDownloadIntoDirectory(), "publications-summary"); private final File publicationEntries = new File(super.getDownloadIntoDirectory(), "publications"); private final File previousPublicationDatesFile = new File(this.yearlyPublicationsDirectory, "publication-dates.json"); public CertEuDownload(File baseMirrorDirectory) { super(baseMirrorDirectory, CertEuDownload.class); } @Override protected void performDownload() { final Map> yearlyPublicationHtmlPages = getAllYearlyPublications(); final Map> entriesToBeFetchedPerYear = new LinkedHashMap<>(); for (Map.Entry> yearPage : yearlyPublicationHtmlPages.entrySet()) { try { final Map entriesToBeFetched = extractUpdatedEntryIdsForYearlyPublicationsHtml(yearPage.getKey(), yearPage.getValue()); if (!entriesToBeFetched.isEmpty()) { entriesToBeFetchedPerYear.put(yearPage.getKey(), entriesToBeFetched); } else { log.info("Year [{}] is already complete and up to date, no need to fetch.", yearPage.getKey()); } } catch (IOException e) { throw new RuntimeException("Failed to extract the yearly publication entries for CERT-EU on year " + yearPage.getKey(), e); } } if (entriesToBeFetchedPerYear.isEmpty()) { log.info("No new entries to fetch, skipping download."); return; } for (Map.Entry> yearEntries : entriesToBeFetchedPerYear.entrySet()) { final Map entriesToBeFetched = yearEntries.getValue(); final int year = yearEntries.getKey(); log.info("Starting fetching 
process for year [{}] with [{}] entries to be fetched.", year, entriesToBeFetched.size()); for (Map.Entry entry : entriesToBeFetched.entrySet()) { final String entryId = entry.getKey(); downloadEntry(year, entryId); } try { this.mergePreviouslyParsedEntryPublicationDatesBackIntoFile(entriesToBeFetched); } catch (IOException e) { log.error("Failed to merge the fetched entries back into the publication dates file, this means that next time, all entries for year [{}] will be fetched again. This error should not occur, as it is a simple file read/write access.", year, e); } } } @Override protected boolean additionalIsDownloadRequired() { final String localRssHash = getLocalRssHash(); final String onlineRssHash = getOnlineRssHash(); if (localRssHash.equals(onlineRssHash)) { return false; } else { log.info("RSS Feed changed since last execution, mirror is required"); super.propertyFiles.set(super.getDownloadIntoDirectory(), "info", InfoFileAttributes.CERT_EU_PREFIX.getKey() + "rss-feed-sha256", onlineRssHash); return true; } } private String getOnlineRssHash() { final URL url = getRemoteResourceLocationUrl(ResourceLocationCertEu.RSS_FEED); final String rssContent = String.join("", super.downloader.fetchResponseBodyFromUrlAsList(url)); return DigestUtils.sha256Hex(rssContent); } private String getLocalRssHash() { return super.propertyFiles.getString(super.getDownloadIntoDirectory(), "info", InfoFileAttributes.CERT_EU_PREFIX.getKey() + "rss-feed-sha256") .orElse(""); } private Map> getAllYearlyPublications() { final Map> yearlyPublicationHtmlPages = new LinkedHashMap<>(); // year -> html lines final List yearsToFetchPublicationsHtmlFor = new ArrayList<>(); try { // start by fetching the oldest one from 2011, then extract the years to fetch from the elements /*
  • 2023
  • */ log.info("Fetching CERT-EU publications for initial year 2011"); final List yearlyPublicationsPage2011 = getYearlyPublicationsPage(2011); yearlyPublicationHtmlPages.put(2011, yearlyPublicationsPage2011); for (String line : yearlyPublicationsPage2011) { final Matcher yearMatcher = PATTERN_YEARLY_PUBLICATIONS_YEAR_EXTRACTION.matcher(line); if (yearMatcher.matches()) { final String year = yearMatcher.group(1); if (!year.matches("\\d+")) { // there is a line that does not contain a year as it redirects to the main page log.warn("Failed to parse year from line: [{}]", line); continue; } final int yearInt = Integer.parseInt(year); yearsToFetchPublicationsHtmlFor.add(yearInt); } } } catch (IOException e) { LOG.error("Failed to fetch the oldest year from CERT-EU publications", e); } log.info("Fetching found CERT-EU publications for years {}", yearsToFetchPublicationsHtmlFor); // fetch the rest of the years for (int year : yearsToFetchPublicationsHtmlFor) { try { final List yearlyPublicationsPage = getYearlyPublicationsPage(year); yearlyPublicationHtmlPages.put(year, yearlyPublicationsPage); } catch (IOException e) { LOG.error("Failed to fetch CERT-EU publications for year " + year, e); } } return yearlyPublicationHtmlPages; } private Map extractUpdatedEntryIdsForYearlyPublicationsHtml(int year, List htmlLines) throws IOException { /*




    © 2015 - 2025 Weber Informatics LLC | Privacy Policy