// com.metaeffekt.artifact.analysis.metascan.CopyrightSegmentationSupport (listing header: Maven / Gradle / Ivy)
/*
* Copyright 2021-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.metaeffekt.artifact.analysis.metascan;
import com.metaeffekt.artifact.analysis.model.PropertyProvider;
import com.metaeffekt.artifact.analysis.scancode.ScanCodeParser;
import com.metaeffekt.artifact.analysis.utils.FileUtils;
import com.metaeffekt.artifact.analysis.utils.InventoryUtils;
import com.metaeffekt.artifact.analysis.utils.SegmentationUtils;
import com.metaeffekt.artifact.analysis.utils.StringUtils;
import com.metaeffekt.artifact.terms.model.MergedSegmentResult;
import com.metaeffekt.artifact.terms.model.NormalizationMetaData;
import com.metaeffekt.artifact.terms.model.TermsMetaData;
import com.metaeffekt.artifact.terms.model.Variables;
import org.json.JSONException;
import org.json.JSONObject;
import org.metaeffekt.core.inventory.processor.model.Artifact;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.*;
public class CopyrightSegmentationSupport extends AbstractScanSupport {
public CopyrightSegmentationSupport(NormalizationMetaData normalizationMetaData, PropertyProvider propertyProvider) {
super(normalizationMetaData, propertyProvider);
}
public boolean runSegmentation(Artifact artifact, File sourceDir) throws IOException, JSONException {
final File analysisDir = sourceDir.getParentFile();
final String sourceFolderName = sourceDir.getName();
final File targetFolder = new File(analysisDir, sourceFolderName + "-analysis");
final File intermediateFolder = new File(analysisDir, sourceFolderName + "-intermediate");
final String filename = artifact.getId().replace("/", "_");
final File metascanResultFile = new File(targetFolder, filename + "_metascan.json");
final File scancodeResultFile = new File(targetFolder, filename + "_scancode.json");
final File resultLogFile = new File(targetFolder, filename + "_copyright-segmentation.txt");
final boolean enabled = getPropertyProvider().
isProperty("analyze.scan.copyright.segmentation.enabled", "true", "false");
if (!enabled) return false;
long metascanResultJsonTimestamp = metascanResultFile.lastModified();
long scancodeResultJsonTimestamp = scancodeResultFile.lastModified();
long resultFileTimestamp = resultLogFile.lastModified();
boolean outdatedResult =
metascanResultJsonTimestamp > resultFileTimestamp ||
scancodeResultJsonTimestamp > resultFileTimestamp;
final boolean overwrite = outdatedResult || getPropertyProvider().
isProperty("analyze.scan.copyright.segmentation.overwrite", "true", "false");
if (!overwrite && resultLogFile.exists()) {
return false;
}
init(resultLogFile, "Copyright Segmentation");
log(resultLogFile, "");
// validate all input folders available
FileUtils.validateExists(analysisDir);
FileUtils.validateExists(metascanResultFile);
FileUtils.validateExists(scancodeResultFile);
String scancodeInput = new JSONObject(FileUtils.readFileToString(scancodeResultFile, StandardCharsets.UTF_8)).optJSONArray("headers").optJSONObject(0).optJSONObject("options").optJSONArray("input").optString(0);
final ScanCodeParser scanCodeParser = new ScanCodeParser();
final Map> licenseFileMap = new HashMap<>();
final Map> copyrightFileMap = new HashMap<>();
final Map> authorFileMap = new HashMap<>();
final Map> metaLicenseFileMap = new SegmentationUtils().getLicenseFileMap(metascanResultFile, false);
scanCodeParser.parseScanCodeResult(FileUtils.readFileToString(scancodeResultFile, FileUtils.ENCODING_UTF_8), licenseFileMap, copyrightFileMap, authorFileMap);
Map> fileCopyrightMap = invertMap(copyrightFileMap);
for (String license : metaLicenseFileMap.keySet()) {
final List files = metaLicenseFileMap.get(license);
boolean loggedLicense = false;
// collect copyrights that are covered by the files
final Set allCoveredCopyrights = new HashSet<>();
for (final String file : files) {
final String scancodePath = sourceDir.getName() + "-intermediate" + file + ".txt";
final List copyrights = fileCopyrightMap.get(scancodePath);
if (copyrights != null) {
allCoveredCopyrights.addAll(copyrights);
}
}
final Map> copyrightCondensedCopyrightMap = new HashMap<>();
for (String copyright : allCoveredCopyrights) {
String condensed = copyright;
condensed = condensed.replaceAll("[0-9]*", "");
condensed = condensed.replaceAll("-", "");
condensed = condensed.replaceAll(",", "");
condensed = condensed.replace("Copyrighted ", "");
condensed = condensed.replace("copyrighted ", "");
condensed = condensed.replace("Copyright ", "");
condensed = condensed.replace("copyright ", "");
condensed = condensed.replace("(c)", "");
condensed = condensed.replace("(C)", "");
condensed = condensed.replace("©", "");
condensed = condensed.trim();
copyrightCondensedCopyrightMap.computeIfAbsent(condensed, k -> new ArrayList<>()).add(copyright);
}
for (String copyright : copyrightCondensedCopyrightMap.keySet()) {
if (!loggedLicense) {
log(resultLogFile, ">>> " + license + " <<<");
loggedLicense = true;
}
log(resultLogFile, " >> " + copyright + " <<");
List condensedList = copyrightCondensedCopyrightMap.get(copyright);
for (String cr : condensedList) {
log(resultLogFile, " > " + cr);
List filesCovered = copyrightFileMap.get(cr);
for (String file : filesCovered) {
String path = FileUtils.asRelativePath(intermediateFolder, new File(analysisDir, file));
log(resultLogFile, " " + scancodeInput + "/" + path);
}
log(resultLogFile, "");
}
}
}
matchScancodeResultToMetascanResult(artifact, sourceDir, scancodeInput);
return true;
}
private void matchScancodeResultToMetascanResult(Artifact artifact, File sourceDir, String scanCodeInput) throws IOException {
// input
final File targetFolder = new File(sourceDir.getParentFile(), sourceDir.getName() + "-analysis");
// output
final String filename = artifact.getId().replace("/", "_");
final File copyrightLicenseAssignmentFile = new File(targetFolder, filename + "_copyright-license-assignment.txt");
final File variableExtractionFile = new File(targetFolder, filename + "_license-variable-extractions.txt");
final File scancodeMissmatchFolder = new File(targetFolder, filename + "-scancode-missmatch-issues");
if (scancodeMissmatchFolder.exists()) FileUtils.deleteDir(scancodeMissmatchFolder);
MergedScanResult mergedResult = new MergedScanResult();
mergedResult.mergeResults(artifact, sourceDir);
copyrightLicenseAssignmentFile(mergedResult, copyrightLicenseAssignmentFile, scanCodeInput);
variableExtractionFile(mergedResult, variableExtractionFile, scanCodeInput);
metaScanScanCodeMissmatch(mergedResult, scancodeMissmatchFolder);
}
private void copyrightLicenseAssignmentFile(MergedScanResult mergedResult, File copyrightLicenseAssignmentFile, String scancodeInput) throws IOException {
init(copyrightLicenseAssignmentFile, "Copyright/License Segmentation");
log(copyrightLicenseAssignmentFile, "");
HashMap> groupedResult = new HashMap<>();
List mergedScanResult = mergedResult.getMergedScanResult();
if (mergedScanResult != null) {
for (MergedSegmentResult mergedSegmentResult : mergedScanResult) {
String licenseResult = InventoryUtils.joinLicenses(mergedSegmentResult.getResolvedLicenses());
String scancodeLicensesResult = InventoryUtils.joinLicenses(mergedSegmentResult.getScancodeLicenses());
String copyrightsResults = InventoryUtils.joinLicenses(mergedSegmentResult.getCopyrights());
if (licenseResult != null || scancodeLicensesResult != null || copyrightsResults != null) {
JSONObject json = new JSONObject();
json.put("licenseResult", StringUtils.notNull(licenseResult));
json.put("scancodeLicenseResult", StringUtils.notNull(scancodeLicensesResult));
json.put("copyrightResult", StringUtils.notNull(copyrightsResults));
final String key = json.toString();
if (groupedResult.get(key) != null) {
groupedResult.get(key).add(scancodeInput + "/" + mergedSegmentResult.getPath());
} else {
List list = new ArrayList<>();
list.add(scancodeInput + "/" + mergedSegmentResult.getPath());
groupedResult.put(key, list);
}
}
}
// FIXME: differentiate markers, filtered licenses, ignored licenses
for (String result : groupedResult.keySet()) {
final JSONObject json = new JSONObject(result);
log(copyrightLicenseAssignmentFile, "-----");
log(copyrightLicenseAssignmentFile, "");
log(copyrightLicenseAssignmentFile, " resolvedLicenses: " + json.getString("licenseResult"));
log(copyrightLicenseAssignmentFile, " scancodeLicenses: " + json.getString("scancodeLicenseResult"));
log(copyrightLicenseAssignmentFile, " scancodeCopyrights: " + json.getString("copyrightResult"));
log(copyrightLicenseAssignmentFile, "");
log(copyrightLicenseAssignmentFile, "These results were found in the following files:");
for (String file : groupedResult.get(result)) {
log(copyrightLicenseAssignmentFile, " >" + file + "<");
}
log(copyrightLicenseAssignmentFile, "");
}
}
}
// FIXME: add documentation
private void variableExtractionFile(MergedScanResult mergedResult, File variableExtractionFile, String scancodeInput) throws IOException {
HashMap> variableFiles = new HashMap<>();
HashMap> variableCopyrights = new HashMap<>();
List mergedScanResult = mergedResult.getMergedScanResult();
if (mergedScanResult != null) {
for (MergedSegmentResult mergedSegmentResult : mergedResult.getMergedScanResult()) {
List variables = mergedSegmentResult.getVariables();
if (variables != null) {
for (Variables variable : variables) {
final String filepath = mergedSegmentResult.getPath();
if (filepath != null) {
final List copyrights = mergedSegmentResult.getCopyrights();
if (variable.listContains(new ArrayList<>(variableFiles.keySet()))) {
Variables key = variable.getFromList(new ArrayList<>(variableFiles.keySet()));
variableFiles.get(key).add(scancodeInput + "/" + filepath);
variableCopyrights.get(key).addAll(copyrights);
} else {
final List fileList = new ArrayList<>();
final List copyrightList = new ArrayList<>();
fileList.add(scancodeInput + "/" + filepath);
copyrightList.addAll(copyrights);
variableFiles.put(variable, fileList);
variableCopyrights.put(variable, copyrightList);
}
}
}
}
}
}
HashMap> licenseVariableList = new HashMap<>();
for (Variables variable : variableFiles.keySet()) {
String license = variable.getLicense();
if (!licenseVariableList.containsKey(license)) {
List variablesList = new ArrayList<>();
variablesList.add(variable);
licenseVariableList.put(license, variablesList);
} else {
licenseVariableList.get(license).add(variable);
}
}
init(variableExtractionFile, "Variable Extraction File");
for (String license : licenseVariableList.keySet()) {
log(variableExtractionFile, "-------------------");
log(variableExtractionFile, license + "\n");
if (licenseVariableList.get(license).size() > 1) {
log(variableExtractionFile, "The " + license + " has " + licenseVariableList.get(license).size() + " different variable sets.\n");
}
for (Variables variable : licenseVariableList.get(license)) {
log(variableExtractionFile, "------");
log(variableExtractionFile, "variables:");
for (String s : variable.getValues().keySet()) {
log(variableExtractionFile, " " + s + ": " + variable.getValues().get(s));
}
log(variableExtractionFile, "\nCopyrights associated with this license and variables:");
for (String s : variableCopyrights.get(variable)) {
log(variableExtractionFile, " - " + s);
}
log(variableExtractionFile, "\nThose variables were found in the following File/Segments:");
for (String s : variableFiles.get(variable)) {
log(variableExtractionFile, " - " + s);
}
log(variableExtractionFile, "\n");
}
}
}
private void metaScanScanCodeMissmatch(MergedScanResult mergedResult, File scancodeMissmatchFolder) throws IOException {
for (MergedSegmentResult mergedSegmentResult : mergedResult.getMergedScanResult()) {
List resolvedLicenses = mergedSegmentResult.getResolvedLicenses();
Map scancodeLicenseKeysValue = mergedSegmentResult.getScancodeLicenseKeys();
List scancodeLicenseKeys = new ArrayList<>(scancodeLicenseKeysValue.keySet());
if (scancodeLicenseKeys.isEmpty()) continue;
String filename = mergedSegmentResult.getPath().substring(mergedSegmentResult.getPath().indexOf("/"));
if (resolvedLicenses.size() == 0) {
File output = new File(scancodeMissmatchFolder, filename);
init(output, "MetaScan result empty, while ScanCode matches:");
for (String s : scancodeLicenseKeys) {
log(output, s);
}
} else {
final NormalizationMetaData normalizationMetaData = getNormalizationMetaData();
filterScancodeLicenseKeys(scancodeLicenseKeys);
final List existingKeys = new ArrayList<>();
for (String scancodeLicenseKey : scancodeLicenseKeys) {
final String key = "scancode:" + scancodeLicenseKey;
for (String resolvedLicense : resolvedLicenses) {
final TermsMetaData termsMetaData = normalizationMetaData.getTermsMetaData(resolvedLicense);
final List otherIds = termsMetaData != null ? termsMetaData.getOtherIds() : null;
if (otherIds != null) {
if (otherIds.contains(key)) {
existingKeys.add(scancodeLicenseKey);
}
} else {
if (termsMetaData != null && termsMetaData.getRepresentedAs() != null) {
List otherIdsRepresentedAs = normalizationMetaData.getTermsMetaData(termsMetaData.getRepresentedAs()).getOtherIds();
if (otherIdsRepresentedAs != null) {
if (otherIdsRepresentedAs.contains(key)) {
existingKeys.add(scancodeLicenseKey);
}
}
}
}
}
}
scancodeLicenseKeys.removeAll(existingKeys);
existingKeys.clear();
for (String scancodeLicenseKey : scancodeLicenseKeys) {
if (scancodeLicenseKeysValue.get(scancodeLicenseKey) < 90) {
existingKeys.add(scancodeLicenseKey);
}
}
scancodeLicenseKeys.removeAll(existingKeys);
if (scancodeLicenseKeys.size() > 0) {
File output = new File(scancodeMissmatchFolder, filename);
init(output, "Additional ScanCode matches detected:");
log(output, "MetaScan matches:" + org.apache.commons.lang3.StringUtils.join(resolvedLicenses, ", "));
log(output, "Additional matches by ScanCode:");
for (String s : scancodeLicenseKeys) {
log(output, s);
}
}
}
}
}
private List filterScancodeLicenseKeys(List scancodeLicenseKeys) {
String[] excludes = {"other-permissive", "proprietary-license", "public-domain", "unknown-license-reference", "unknown"};
for (String exclude : excludes) {
scancodeLicenseKeys.remove(exclude);
}
return scancodeLicenseKeys;
}
protected Map> invertMap(Map> keyListMap) {
final Map> invertedMap = new HashMap<>();
for (final Map.Entry> entry : keyListMap.entrySet()) {
for (final String path : entry.getValue()) {
List values = invertedMap.get(path);
if (values == null) {
values = new ArrayList<>();
invertedMap.put(path, values);
}
if (!values.contains(entry.getKey())) {
values.add(entry.getKey());
}
}
}
return invertedMap;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy (footer of the hosting web page, not part of the source file)