// com.metaeffekt.mirror.download.nvd.NvdCpeApiDownload (Maven / Gradle / Ivy)
/*
* Copyright 2021-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.metaeffekt.mirror.download.nvd;
import com.metaeffekt.artifact.analysis.utils.FileUtils;
import com.metaeffekt.artifact.analysis.utils.TimeUtils;
import com.metaeffekt.mirror.download.documentation.MirrorMetadata;
import com.metaeffekt.mirror.Retry;
import com.metaeffekt.mirror.download.Download;
import com.metaeffekt.mirror.download.ResourceLocation;
import org.apache.commons.lang3.ObjectUtils;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;
/**
* See the NVD CVE download for more details on the general data format and references of the data source.
* The data feed specific to the CPE data is split into two parts: The CPE Dictionary and the CPE Matches with each their own endpoint:
*
* - dictionary: https://services.nvd.nist.gov/rest/json/cpes/2.0
* allows 10000 results per request: ~1000000 CPE entries --> ~100 requests
* The dictionary contains a list of CPE that can be used to identify products. However, (most of) these CPEs do not contain any version information.
*
* - match: https://services.nvd.nist.gov/rest/json/cpematch/2.0
* allows 5000 results per request: ~450000 CPE entries --> ~90 requests
* The CPE match contains the versions that are missing from the dictionary. It contains almost no new CPEs, but the version information is added to the previously found CPEs.
*
*
* Since our version matching algorithm does not rely solely on the versions provided by the CPE entries,
* the relations between the different entries are not relevant in our context and the hierarchical structure from dict/match is flattened,
* normalized and stored in a single data structure.
* The relevant keys that are stored in the local files for each CPE are {@code cpeName}, {@code cpeNameId},
* {@code lastModified}, {@code created} and {@code deprecated}. Additionally, they can have {@code titles}
* and {@code refs} for several titles in different languages and references with a title and a link.
*/
@MirrorMetadata(directoryName = "cpe-dict", mavenPropertyName = "nvdCpeDownload")
public class NvdCpeApiDownload extends Download {
private static final Logger LOG = LoggerFactory.getLogger(NvdCpeApiDownload.class);

/**
 * Formats the <code>lastModStartDate</code> / <code>lastModEndDate</code> request parameters.
 * <p>
 * {@link SimpleDateFormat} instances are not thread-safe, so any use from concurrently running download tasks has
 * to be synchronized externally.
 * <p>
 * NOTE(review): the trailing {@code 'Z'} is a quoted literal and no time zone is configured on this formatter, so
 * dates are rendered in the formatter's default time zone - confirm that this matches what the API expects.
 */
private static final SimpleDateFormat ISO_8601_DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");

/**
 * The NVD API limits authorized requests that provide an API Key to 50 per rolling 30 seconds window.
 * 20ms are added to ensure the limit is not reached by unfortunate timings.
 */
private static final int API_DELAY_BETWEEN_AUTHORIZED_REQUESTS = (30 * 1000 / 50) + 20;

/**
 * The NVD API limits unauthorized requests to 5 per rolling 30 seconds window.
 * 400ms are added to ensure the limit is not reached by unfortunate timings.
 */
private static final int API_DELAY_BETWEEN_UNAUTHORIZED_REQUESTS = 30 * 1000 / 5 + 400;

/**
 * Optional NVD API key; <code>null</code> means that requests are sent without authorization.
 */
private String apiKey;

/**
 * Result arrays of the API pages that have been downloaded but not yet merged into the yearly files.
 * Download tasks append to this list while the thread driving the download drains it.
 */
private final List<JSONArray> apiResponseDataToBeProcessed = Collections.synchronizedList(new ArrayList<>());
/**
 * Creates the CPE download by handing the given directory and this class to the {@link Download} super constructor.
 *
 * @param baseMirrorDirectory the base mirror directory that is passed on to the super class
 */
public NvdCpeApiDownload(File baseMirrorDirectory) {
super(baseMirrorDirectory, NvdCpeApiDownload.class);
}
/**
 * Sets the NVD API key used for the requests of this download.
 * When no key is set (<code>null</code>), the longer delay for unauthorized requests is applied between requests.
 *
 * @param apiKey the API key, may be <code>null</code>
 * @return this instance, to allow chaining of calls
 */
public NvdCpeApiDownload setApiKey(String apiKey) {
this.apiKey = apiKey;
return this;
}
/**
 * Mirrors the CPE Dictionary data first and the CPE Match data second, either completely or only for the time
 * range between the last modification of the download directory and now.
 * The request delay of the executor depends on whether an API key has been configured.
 * <p>
 * Based on the NVD API user workflow.
 */
@Override
protected void performDownload() {
    final boolean fullMirrorRequired = isFullMirrorRequired();
    LOG.info(fullMirrorRequired
            ? "Downloading initial NVD data from NVD API"
            : "Existing mirror detected. Downloading incremental NVD data from NVD API");

    final boolean authorized = apiKey != null;
    final long baseSleepDuration = authorized
            ? API_DELAY_BETWEEN_AUTHORIZED_REQUESTS
            : API_DELAY_BETWEEN_UNAUTHORIZED_REQUESTS;

    super.executor.setSize(4);
    super.executor.setDelay(baseSleepDuration);

    LOG.info("Requests {} be authorized with an API key, delay between requests [{}]",
            authorized ? "will" : "will not", TimeUtils.formatTimeDiff(baseSleepDuration));

    // the incremental time range spans from the last change of the local mirror up to now
    final Date lastModifiedDate = new Date(getDownloadDirectoryLastModified());
    final Date now = new Date(TimeUtils.utcNow());

    downloadCpeDictionaryApiData(fullMirrorRequired, lastModifiedDate, now);
    downloadCpeMatchApiData(fullMirrorRequired, lastModifiedDate, now);
}
/**
 * Downloads and processes the data of the CPE Dictionary endpoint.
 *
 * @param fullMirrorRequired whether all entries or only the given time range is requested
 * @param lastModifiedDate   start of the time range for an incremental download
 * @param now                end of the time range for an incremental download
 */
private void downloadCpeDictionaryApiData(boolean fullMirrorRequired, Date lastModifiedDate, Date now) {
    final ResourceLocationNvd completeListing = ResourceLocationNvd.CPE_API_LIST_ALL;
    final ResourceLocationNvd rangedListing = ResourceLocationNvd.CPE_API_START_END_DATE;

    LOG.info("Downloading NVD CPE Dictionary API data...");
    downloadCpeApiDataFromSource(fullMirrorRequired, lastModifiedDate, now, completeListing, rangedListing);
    LOG.info("Finished processing NVD CPE Dictionary API data");
}
/**
 * Downloads and processes the data of the CPE Match endpoint.
 *
 * @param fullMirrorRequired whether all entries or only the given time range is requested
 * @param lastModifiedDate   start of the time range for an incremental download
 * @param now                end of the time range for an incremental download
 */
private void downloadCpeMatchApiData(boolean fullMirrorRequired, Date lastModifiedDate, Date now) {
    final ResourceLocationNvd completeListing = ResourceLocationNvd.CPE_MATCH_API_LIST_ALL;
    final ResourceLocationNvd rangedListing = ResourceLocationNvd.CPE_MATCH_API_START_END_DATE;

    LOG.info("Downloading NVD CPE Match API data...");
    downloadCpeApiDataFromSource(fullMirrorRequired, lastModifiedDate, now, completeListing, rangedListing);
    LOG.info("Finished processing NVD CPE Match API data");
}
/**
 * Schedules the download tasks for one endpoint and processes their results.
 *
 * @param fullMirrorRequired whether all entries or only the given time range is requested
 * @param lastModifiedDate   start of the time range for an incremental download
 * @param now                end of the time range for an incremental download
 * @param cpeApiListAll      location used when all entries are requested
 * @param cpeApiStartEndDate location used when only a time range is requested
 */
private void downloadCpeApiDataFromSource(boolean fullMirrorRequired, Date lastModifiedDate, Date now, ResourceLocationNvd cpeApiListAll, ResourceLocationNvd cpeApiStartEndDate) {
    createDownloadThreads(fullMirrorRequired, lastModifiedDate, now, cpeApiListAll, cpeApiStartEndDate);

    // check at least 5 times if the executor is still running
    int remainingChecks = 5;
    while (remainingChecks > 0) {
        processResponseDataUntilDone();
        remainingChecks--;
    }
}
/**
 * Starts the executor and merges the downloaded API pages into the local files until the executor reports that it
 * is no longer running. While the executor is running, pages are processed in batches of at least four; once it
 * has finished, everything that is left is processed.
 * <p>
 * An interrupt received while waiting does not abort the processing, but the interrupt status of the thread is
 * restored before this method returns so that callers can still react to it.
 */
private void processResponseDataUntilDone() {
    super.executor.start();

    boolean interrupted = false;
    try {
        do {
            // wait for a few pages to accumulate to reduce the amount of read/merge/write cycles on the year files
            processPendingResponseData(4);
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // remember the interrupt instead of re-setting it right away, otherwise every following sleep
                // would return immediately and turn this loop into a busy wait
                interrupted = true;
            }
        } while (super.executor.isRunning());

        // the executor is done, process whatever is left regardless of the batch size
        processPendingResponseData(1);
    } finally {
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }
}

/**
 * Removes all pending API pages from the response cache and processes them, provided that at least
 * <code>minimumBatchSize</code> pages are pending. Otherwise the cache is left untouched.
 *
 * @param minimumBatchSize the amount of pending pages required for the processing to take place
 */
private void processPendingResponseData(int minimumBatchSize) {
    final List<JSONArray> batch;
    synchronized (apiResponseDataToBeProcessed) {
        if (apiResponseDataToBeProcessed.size() < minimumBatchSize) {
            return;
        }
        batch = new ArrayList<>(apiResponseDataToBeProcessed);
        apiResponseDataToBeProcessed.clear();
    }
    // process outside of the lock so that download tasks can keep appending pages
    this.processApiCpeItems(batch);
}
/**
 * Submits a task that downloads the first page of the given endpoint and, based on the total amount of results
 * reported by that page, submits one further task per remaining page.
 * Every downloaded page is appended to the response cache.
 *
 * @param fullMirrorRequired   whether all entries or only the given time range is requested
 * @param lastModifiedDate     start of the time range for an incremental download
 * @param now                  end of the time range for an incremental download
 * @param locationAll          location used when all entries are requested
 * @param locationStartEndDate location used when only a time range is requested
 */
private void createDownloadThreads(boolean fullMirrorRequired, Date lastModifiedDate, Date now, ResourceLocationNvd locationAll, ResourceLocationNvd locationStartEndDate) {
    super.executor.submit(() -> {
        final JSONObject firstPage = downloadCpeDecideWhatTimeFrame(fullMirrorRequired, lastModifiedDate, now, 0, locationAll, locationStartEndDate);
        final int totalResults = firstPage.getInt("totalResults");
        final int resultsPerPage = firstPage.getInt("resultsPerPage");

        if (totalResults == 0) {
            LOG.info("No CPEs found for the given range.");
            return;
        }

        LOG.info("Downloaded CPEs [{}] to [{}] of [{}]", 0, resultsPerPage, totalResults);
        appendJsonToProcessResponseCache(firstPage);

        if (resultsPerPage <= 0) {
            // without a positive page size the start index could never advance
            LOG.error("NVD CPE API reported [{}] results per page for [{}] total results, unable to request further pages", resultsPerPage, totalResults);
            return;
        }

        // the first page has already been downloaded; only request start indexes that are still within the results
        for (int startIndex = resultsPerPage; startIndex < totalResults; startIndex += resultsPerPage) {
            final int pageStartIndex = startIndex;
            super.executor.submit(() -> {
                final JSONObject page = downloadCpeDecideWhatTimeFrame(fullMirrorRequired, lastModifiedDate, now, pageStartIndex, locationAll, locationStartEndDate);
                final int pageTotalResults = page.getInt("totalResults");
                final int pageResultsPerPage = page.getInt("resultsPerPage");
                LOG.info("Downloaded CPEs [{}] to [{}] of [{}]", pageStartIndex, pageStartIndex + pageResultsPerPage, pageTotalResults);
                appendJsonToProcessResponseCache(page);
            });
        }
    });
}
/**
 * Adds the result array of an API response to the response cache.
 * The array is read from the key <code>products</code> or, if that is not present, from <code>matchStrings</code>.
 * If neither is available, an error is logged and nothing is added.
 *
 * @param json the API response
 */
private void appendJsonToProcessResponseCache(JSONObject json) {
    JSONArray entries = json.optJSONArray("products");
    if (entries == null) {
        entries = json.optJSONArray("matchStrings");
    }

    if (entries == null) {
        LOG.error("Unable to find 'products' or 'matchStrings' in the JSON response: {}", json.keySet());
        return;
    }

    synchronized (apiResponseDataToBeProcessed) {
        apiResponseDataToBeProcessed.add(entries);
    }
}
/**
 * Downloads a single page, either from the complete listing or from the listing restricted to the given time
 * range. Failed attempts are repeated via {@link Retry}.
 *
 * @param fullMirrorRequired   whether the complete listing (<code>true</code>) or the time range is requested
 * @param lastModifiedDate     start of the time range for an incremental download
 * @param now                  end of the time range for an incremental download
 * @param offset               0-based index of the first entry of the page
 * @param locationAll          location used when all entries are requested
 * @param locationStartEndDate location used when only a time range is requested
 * @return the parsed API response, never <code>null</code>
 * @throws IllegalStateException if no response is available after the retries have completed
 */
private JSONObject downloadCpeDecideWhatTimeFrame(boolean fullMirrorRequired, Date lastModifiedDate, Date now, int offset, ResourceLocationNvd locationAll, ResourceLocationNvd locationStartEndDate) {
    final AtomicReference<JSONObject> json = new AtomicReference<>();

    new Retry(() -> {
        if (fullMirrorRequired) {
            json.set(downloadCpePage(offset, locationAll));
        } else {
            json.set(downloadCpePage(offset, lastModifiedDate, now, locationStartEndDate));
        }
    })
            .withDelay((int) (API_DELAY_BETWEEN_UNAUTHORIZED_REQUESTS * 1.5d)) // fallback to the unauthorized API access delay in case of fail
            .onException(Exception.class)
            .retryCount(8)
            .run();

    final JSONObject page = json.get();
    if (page == null) {
        // NOTE(review): whether Retry#run throws once all attempts failed is not visible from here; fail with a
        // clear message instead of a NullPointerException in the caller
        throw new IllegalStateException("No response received from NVD CPE API for start index [" + offset + "]");
    }
    return page;
}
/**
 * Normalizes the entries of the given API result arrays, groups them by the year of their creation and merges
 * them into the local year files.
 * Every entry is unwrapped and, if it is a CPE match entry, converted into the CPE dictionary format first.
 *
 * @param cpesArrays the result arrays of the downloaded API pages
 */
private void processApiCpeItems(Collection<JSONArray> cpesArrays) {
    final Map<Integer, JSONArray> yearCves = new HashMap<>();

    for (JSONArray products : cpesArrays) {
        final JSONArray convertedProducts = new JSONArray();
        for (int i = 0; i < products.length(); i++) {
            final JSONObject unwrapped = unwrapCpeEntry(products.getJSONObject(i));
            convertedProducts.put(convertCpeMatchToCpeDictItem(unwrapped));
        }

        final Map<Integer, JSONArray> byYear = sortCpesIntoYears(convertedProducts);
        for (Map.Entry<Integer, JSONArray> yearEntry : byYear.entrySet()) {
            final JSONArray appendArray = yearCves.computeIfAbsent(yearEntry.getKey(), k -> new JSONArray());
            final JSONArray entriesOfYear = yearEntry.getValue();
            for (int j = 0; j < entriesOfYear.length(); j++) {
                appendArray.put(entriesOfYear.getJSONObject(j));
            }
        }
    }

    // sanity check: grouping by year must neither drop nor duplicate entries
    final int sizeBefore = cpesArrays.stream().mapToInt(JSONArray::length).sum();
    final int sizeAfter = yearCves.values().stream().mapToInt(JSONArray::length).sum();
    if (sizeBefore != sizeAfter) {
        LOG.error("Dropped at least one CPE whilst sorting CPEs into yearly files: [{}] -> [{}]", sizeBefore, sizeAfter);
    }

    processApiCpeItems(yearCves);
}
/**
 * Merges the given entries into the local year files. For every year, the existing file content is read, merged
 * with the new entries (new entries take precedence over existing ones with the same ID) and written back.
 *
 * @param yearCves the entries to store, grouped by year
 * @throws RuntimeException if a year file cannot be written
 */
private void processApiCpeItems(Map<Integer, JSONArray> yearCves) {
    if (yearCves.isEmpty()) {
        LOG.warn("No CPEs to process from the API.");
        return;
    }

    // years with the most entries first
    final String yearSummary = yearCves.entrySet().stream()
            .sorted((left, right) -> Integer.compare(right.getValue().length(), left.getValue().length()))
            .map(e -> e.getKey() + " = " + e.getValue().length())
            .collect(Collectors.joining("; "));
    LOG.info("Processing CPE data from years: [{}]", yearSummary);

    for (Map.Entry<Integer, JSONArray> entry : yearCves.entrySet()) {
        final int year = entry.getKey();
        final JSONArray cpes = entry.getValue();

        final JSONArray existingJson = parseCpeItemsFromDownloadedYear(year);
        // the downloaded entries are listed first so that they win over outdated local entries with the same ID
        final JSONArray mergedJson = mergeCpeItems(Arrays.asList(cpes, existingJson));

        LOG.info("Year: [{}], merging existing with downloaded [{} + {} --> {}]", year, existingJson.length(), cpes.length(), mergedJson.length());

        final File cpeFile = new File(super.downloadIntoDirectory, year + ".json");
        try {
            FileUtils.write(cpeFile, mergedJson.toString(), StandardCharsets.UTF_8);
        } catch (IOException e) {
            throw new RuntimeException("Unable to write NVD CPE year file " + cpeFile.getAbsolutePath(), e);
        }
    }
}
/**
 * Converts an entry of the CPE Match API into the format of a CPE Dictionary entry.
 * Entries that do not provide all of <code>criteria</code>, <code>matchCriteriaId</code>,
 * <code>lastModified</code> and <code>created</code> are returned unchanged.
 *
 * @param cpe the entry to convert
 * @return a new dictionary-style entry or the given instance if it is not a complete match entry
 */
private JSONObject convertCpeMatchToCpeDictItem(JSONObject cpe) {
    for (String requiredKey : new String[]{"criteria", "matchCriteriaId", "lastModified", "created"}) {
        if (!cpe.has(requiredKey)) {
            return cpe;
        }
    }

    final JSONObject dictItem = new JSONObject();
    dictItem.put("deprecated", false);
    dictItem.put("cpeName", cpe.getString("criteria"));
    dictItem.put("cpeNameId", cpe.getString("matchCriteriaId"));
    dictItem.put("lastModified", cpe.getString("lastModified"));
    dictItem.put("created", cpe.getString("created"));
    // TODO: validate that the following are not needed:
    // this should not be necessary, as the matches IDs are guaranteed to be present in the CPE Dictionary API:
    // dictItem.put("matches", cpe.optJSONArray("matches"));
    return dictItem;
}
/**
 * Returns the object nested below the key <code>cpe</code> or, if that key is absent, below
 * <code>matchString</code>. If neither key exists, the given object itself is returned.
 *
 * @param cpeMatch the possibly wrapped entry
 * @return the unwrapped entry
 */
private static JSONObject unwrapCpeEntry(JSONObject cpeMatch) {
    for (String wrapperKey : new String[]{"cpe", "matchString"}) {
        if (cpeMatch.has(wrapperKey)) {
            return cpeMatch.getJSONObject(wrapperKey);
        }
    }
    return cpeMatch;
}
/**
 * Reads the entries that are stored locally for the given year.
 *
 * @param year the year of the file to read
 * @return the content of <code>&lt;year&gt;.json</code> in the download directory, or an empty array if that
 * file does not exist
 * @throws RuntimeException if the file exists but cannot be read
 */
private JSONArray parseCpeItemsFromDownloadedYear(int year) {
    final File yearFile = new File(super.downloadIntoDirectory, year + ".json");

    if (yearFile.exists()) {
        final String content;
        try {
            content = FileUtils.readFileToString(yearFile, StandardCharsets.UTF_8);
        } catch (IOException e) {
            throw new RuntimeException("Unable to read NVD CPE year file " + yearFile.getAbsolutePath(), e);
        }
        return new JSONArray(content);
    }

    return new JSONArray();
}
/**
 * Groups the given entries by the year of their <code>created</code> timestamp.
 *
 * @param cpesArray the entries to group, may be <code>null</code>
 * @return the entries grouped by year; empty if <code>cpesArray</code> is <code>null</code>
 * @throws RuntimeException if an entry has no <code>created</code> timestamp or the timestamp does not start with
 *                          a four-digit year
 */
private Map<Integer, JSONArray> sortCpesIntoYears(JSONArray cpesArray) {
    final Map<Integer, JSONArray> yearCves = new HashMap<>();

    if (cpesArray == null) {
        LOG.warn("No CPEs to process from the API while sorting CPEs into years.");
        return yearCves;
    }

    for (int i = 0; i < cpesArray.length(); i++) {
        final JSONObject cpe = cpesArray.getJSONObject(i);

        if (!cpe.has("created")) {
            throw new RuntimeException("CPE entry does not provide 'created' timestamp: " + cpe);
        }

        final String createdTimestamp = cpe.getString("created"); // e.g. 2007-08-23T21:05:57.937
        final int year;
        try {
            year = Integer.parseInt(createdTimestamp.substring(0, 4));
        } catch (IndexOutOfBoundsException | NumberFormatException e) {
            throw new RuntimeException("CPE entry provides malformed 'created' timestamp [" + createdTimestamp + "]: " + cpe, e);
        }

        // a value parsed from four characters can never exceed 9999, so only the lower bound needs checking
        if (year < 1999) {
            LOG.warn("CPE entry most likely has invalid year [{}]", cpe);
        }

        yearCves.computeIfAbsent(year, k -> new JSONArray())
                .put(cpe);
    }

    final int countBefore = cpesArray.length();
    final int countAfter = yearCves.values().stream().mapToInt(JSONArray::length).sum();
    if (countBefore != countAfter) {
        LOG.warn("CPEs were lost during sorting into years: [{} --> {}]", countBefore, countAfter);
    }

    return yearCves;
}
/**
 * Merges the entries of several arrays into one array without duplicate IDs.
 * The ID of an entry is its <code>cpeNameId</code> or, if absent, its <code>matchCriteriaId</code>.
 * For entries that share an ID, the first occurrence wins, so arrays listed earlier take precedence.
 * Entries without any ID are skipped with a warning.
 *
 * @param cpes the arrays to merge, in order of precedence
 * @return a new array containing the merged entries
 */
private JSONArray mergeCpeItems(List<JSONArray> cpes) {
    final JSONArray mergedJson = new JSONArray();
    final Set<String> knownIds = new HashSet<>();

    for (JSONArray array : cpes) {
        for (int i = 0; i < array.length(); i++) {
            final JSONObject cpe = unwrapCpeEntry(array.getJSONObject(i));
            final String id = ObjectUtils.firstNonNull(cpe.optString("cpeNameId", null), cpe.optString("matchCriteriaId", null));

            // check for a missing ID first: Set#add(null) succeeds once, which would let the first entry without
            // an ID slip into the result while all following ones are dropped
            if (id == null) {
                LOG.warn("CPE entry does not provide 'cpeNameId' or 'matchCriteriaId' - skipping: [{}]", cpe);
                continue;
            }

            if (knownIds.add(id)) {
                mergedJson.put(cpe);
            } else {
                LOG.debug("CPE entry with ID [{}] already exists - skipping: [{}]", id, cpe);
            }
        }
    }

    return mergedJson;
}
/**
 * Downloads one page of a listing that is not restricted to a time range.
 *
 * @param offset       0-based index of the first entry of the page
 * @param baseLocation the location whose URL is resolved with the offset
 * @return the parsed API response
 * @throws RuntimeException if the response body is not a valid JSON object
 */
private JSONObject downloadCpePage(int offset, ResourceLocationNvd baseLocation) {
    final URL pageUrl = super.getRemoteResourceLocationUrl(baseLocation, offset);
    final List<String> response = super.downloader.fetchResponseBodyFromUrlAsList(pageUrl, Collections.singletonMap("apiKey", apiKey));

    // the body is delivered as a list of strings, join them without a separator before parsing
    final String body = String.join("", response);
    try {
        return new JSONObject(body);
    } catch (JSONException e) {
        throw new RuntimeException("Unable to parse NVD CPE API response: " + response + "\nRequest URL: " + pageUrl, e);
    }
}
/**
 * Downloads one page of a listing that is restricted to entries modified within the given time range.
 * <p>
 * This method is invoked from concurrently running download tasks, therefore the access to the shared,
 * non-thread-safe date formatter is synchronized.
 *
 * @param offset           0-based index of the first entry of the page
 * @param lastModStartDate start of the time range
 * @param lastModEndDate   end of the time range
 * @param baseLocation     the location whose URL is resolved with the offset and the formatted dates
 * @return the parsed API response
 * @throws IllegalArgumentException if the time range spans more than 120 days
 * @throws RuntimeException         if the response body is not a valid JSON object
 */
private JSONObject downloadCpePage(int offset, Date lastModStartDate, Date lastModEndDate, ResourceLocationNvd baseLocation) {
    final long diff = lastModEndDate.getTime() - lastModStartDate.getTime();
    if (diff > 120L * 24 * 60 * 60 * 1000) {
        throw new IllegalArgumentException("Difference between lastModStartDate and lastModEndDate must not be greater than 120 days");
    }

    final String startDate;
    final String endDate;
    // SimpleDateFormat keeps mutable state while formatting, concurrent use can produce corrupted output
    synchronized (ISO_8601_DATE_FORMAT) {
        startDate = ISO_8601_DATE_FORMAT.format(lastModStartDate);
        endDate = ISO_8601_DATE_FORMAT.format(lastModEndDate);
    }

    final URL pageUrl = super.getRemoteResourceLocationUrl(baseLocation, offset, startDate, endDate);
    final List<String> response = super.downloader.fetchResponseBodyFromUrlAsList(pageUrl, Collections.singletonMap("apiKey", apiKey));

    try {
        return new JSONObject(String.join("", response));
    } catch (JSONException e) {
        throw new RuntimeException("Unable to parse NVD CPE API response: " + response + "\nRequest URL: " + pageUrl, e);
    }
}
/**
 * Checks whether the local mirror has to be downloaded completely instead of being updated incrementally.
 * This is the case if
 * <ul>
 *     <li>the download directory cannot be listed or contains no <code>.json</code> file,</li>
 *     <li>the file of any year from 2007 up to and including the previous calendar year is missing, or</li>
 *     <li>the download directory has last been modified more than 120 days ago, which is the maximum time range
 *     accepted for incremental requests.</li>
 * </ul>
 *
 * @return <code>true</code> if a full mirror is required
 */
private boolean isFullMirrorRequired() {
    final File[] downloadFiles = super.downloadIntoDirectory.listFiles();
    if (downloadFiles == null) {
        LOG.info("No CPE JSON files found in download directory, performing full mirror");
        return true;
    }

    final Set<String> fileNames = Arrays.stream(downloadFiles).map(File::getName).collect(Collectors.toSet());
    if (fileNames.stream().noneMatch(fileName -> fileName.endsWith(".json"))) {
        LOG.info("No CPE JSON files found in download directory, performing full mirror");
        return true;
    }

    // the file of the current year may legitimately not exist yet, so only check up to the previous year
    final int latestCheckYear = Calendar.getInstance().get(Calendar.YEAR) - 1;
    for (int year = 2007; year <= latestCheckYear; year++) {
        if (!fileNames.contains(year + ".json")) {
            LOG.info("Missing CPE JSON file for year [{}], performing full mirror", year);
            return true;
        }
    }

    final long days120 = 120L * 24 * 60 * 60 * 1000;
    final long directoryLastModified = getDownloadDirectoryLastModified();
    if (super.isUpdatedAgeOlderThan(directoryLastModified, days120)) {
        LOG.info("Download directory last modified date is older than 120 days, performing full mirror");
        return true;
    }

    return false;
}
/**
 * Checks whether a download is required: either because a full mirror is required, or because the CPE Match or
 * the CPE Dictionary endpoint reports entries that have been modified since the download directory was last
 * modified. The CPE Match endpoint is queried first; the CPE Dictionary endpoint is only queried if the first
 * query reports no changes.
 *
 * @return <code>true</code> if a download is required
 */
@Override
protected boolean additionalIsDownloadRequired() {
    if (isFullMirrorRequired()) {
        return true;
    }

    final Date since = new Date(getDownloadDirectoryLastModified());
    final Date until = new Date(TimeUtils.utcNow());

    final JSONObject matchResponse = downloadCpePage(0, since, until, ResourceLocationNvd.CPE_MATCH_API_START_END_DATE);
    if (reportsChangedEntries(matchResponse, "NVD CPE Match API reports [{}] new/changed entries since last download")) {
        return true;
    }

    final JSONObject dictResponse = downloadCpePage(0, since, until, ResourceLocationNvd.CPE_API_START_END_DATE);
    return reportsChangedEntries(dictResponse, "NVD CPE Dictionary API reports [{}] new/changed entries since last download");
}

/**
 * Checks whether the given API response reports more than zero total results and logs the given message with the
 * amount of results if it does.
 *
 * @param response   the API response
 * @param logMessage the message to log, with one placeholder for the amount of results
 * @return <code>true</code> if the response provides <code>totalResults</code> with a value greater than zero
 */
private static boolean reportsChangedEntries(JSONObject response, String logMessage) {
    if (!response.has("totalResults")) {
        return false;
    }
    final int totalResults = response.getInt("totalResults");
    if (totalResults <= 0) {
        return false;
    }
    LOG.info(logMessage, totalResults);
    return true;
}
/**
 * Overrides the URL of one of the {@link ResourceLocationNvd} locations.
 *
 * @param location the name of the {@link ResourceLocationNvd} constant
 * @param url      the URL to use for that location
 * @throws IllegalArgumentException if <code>location</code> is not the name of a {@link ResourceLocationNvd} constant
 */
@Override
public void setRemoteResourceLocation(String location, String url) {
    final ResourceLocationNvd resolvedLocation = ResourceLocationNvd.valueOf(location);
    super.setRemoteResourceLocation(resolvedLocation, url);
}
/**
 * The remote locations of the NVD CPE API used by this download, each with its default URL template.
 */
public enum ResourceLocationNvd implements ResourceLocation {
    /**
     * All entries of the CPE Dictionary.
     * <ul>
     *     <li><code>startIndex</code>: 0-based index of the first CPE to be returned in the response data</li>
     * </ul>
     */
    CPE_API_LIST_ALL("https://services.nvd.nist.gov/rest/json/cpes/2.0?startIndex=%d"),

    /**
     * The entries of the CPE Dictionary that have been modified within a time range.
     * <p>
     * The maximum allowable range when using any date range parameters is 120 consecutive days.
     * Values must be entered in the extended ISO-8601 date/time format:
     * <code>[YYYY]["-"][MM]["-"][DD]["T"][HH][":"][MM][":"][SS][Z]</code>
     * <ul>
     *     <li><code>startIndex</code>: 0-based index of the first CPE to be returned in the response data</li>
     *     <li><code>lastModStartDate</code>: the start date</li>
     *     <li><code>lastModEndDate</code>: the end date</li>
     * </ul>
     */
    CPE_API_START_END_DATE("https://services.nvd.nist.gov/rest/json/cpes/2.0?startIndex=%d&lastModStartDate=%s&lastModEndDate=%s"),

    /**
     * All entries of the CPE Match data.
     * <ul>
     *     <li><code>startIndex</code>: 0-based index of the first CPE to be returned in the response data</li>
     * </ul>
     */
    CPE_MATCH_API_LIST_ALL("https://services.nvd.nist.gov/rest/json/cpematch/2.0?startIndex=%d"),

    /**
     * The entries of the CPE Match data that have been modified within a time range.
     * <p>
     * The maximum allowable range when using any date range parameters is 120 consecutive days.
     * Values must be entered in the extended ISO-8601 date/time format:
     * <code>[YYYY]["-"][MM]["-"][DD]["T"][HH][":"][MM][":"][SS][Z]</code>
     * <ul>
     *     <li><code>startIndex</code>: 0-based index of the first CPE to be returned in the response data</li>
     *     <li><code>lastModStartDate</code>: the start date</li>
     *     <li><code>lastModEndDate</code>: the end date</li>
     * </ul>
     */
    CPE_MATCH_API_START_END_DATE("https://services.nvd.nist.gov/rest/json/cpematch/2.0?startIndex=%d&lastModStartDate=%s&lastModEndDate=%s");

    /**
     * The URL template that applies unless the location has been overridden.
     */
    private final String defaultUrlTemplate;

    ResourceLocationNvd(String defaultUrlTemplate) {
        this.defaultUrlTemplate = defaultUrlTemplate;
    }

    @Override
    public String getDefault() {
        return defaultUrlTemplate;
    }
}
}