All downloads are free. The search and download functionality uses the official Maven repository.

com.metaeffekt.artifact.analysis.metascan.CopyrightSegmentationSupport Maven / Gradle / Ivy

There is a newer version: 0.132.0
Show newest version
/*
 * Copyright 2021-2024 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.metaeffekt.artifact.analysis.metascan;

import com.metaeffekt.artifact.analysis.model.PropertyProvider;
import com.metaeffekt.artifact.analysis.scancode.ScanCodeParser;
import com.metaeffekt.artifact.analysis.utils.FileUtils;
import com.metaeffekt.artifact.analysis.utils.InventoryUtils;
import com.metaeffekt.artifact.analysis.utils.SegmentationUtils;
import com.metaeffekt.artifact.analysis.utils.StringUtils;
import com.metaeffekt.artifact.terms.model.MergedSegmentResult;
import com.metaeffekt.artifact.terms.model.NormalizationMetaData;
import com.metaeffekt.artifact.terms.model.TermsMetaData;
import com.metaeffekt.artifact.terms.model.Variables;
import org.json.JSONException;
import org.json.JSONObject;
import org.metaeffekt.core.inventory.processor.model.Artifact;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.*;

public class CopyrightSegmentationSupport extends AbstractScanSupport {

    /**
     * Creates the support instance; both arguments are passed unchanged to the superclass constructor.
     *
     * @param normalizationMetaData metadata later obtained through {@code getNormalizationMetaData()} to look up
     *                              license terms
     * @param propertyProvider      provider later obtained through {@code getPropertyProvider()} to read the
     *                              {@code analyze.scan.copyright.segmentation.*} properties
     */
    public CopyrightSegmentationSupport(NormalizationMetaData normalizationMetaData, PropertyProvider propertyProvider) {
        super(normalizationMetaData, propertyProvider);
    }

    public boolean runSegmentation(Artifact artifact, File sourceDir) throws IOException, JSONException {
        final File analysisDir = sourceDir.getParentFile();
        final String sourceFolderName = sourceDir.getName();

        final File targetFolder = new File(analysisDir, sourceFolderName + "-analysis");
        final File intermediateFolder = new File(analysisDir, sourceFolderName + "-intermediate");

        final String filename = artifact.getId().replace("/", "_");
        final File metascanResultFile = new File(targetFolder, filename + "_metascan.json");
        final File scancodeResultFile = new File(targetFolder, filename + "_scancode.json");

        final File resultLogFile = new File(targetFolder, filename + "_copyright-segmentation.txt");

        final boolean enabled = getPropertyProvider().
                isProperty("analyze.scan.copyright.segmentation.enabled", "true", "false");
        if (!enabled) return false;

        long metascanResultJsonTimestamp = metascanResultFile.lastModified();
        long scancodeResultJsonTimestamp = scancodeResultFile.lastModified();
        long resultFileTimestamp = resultLogFile.lastModified();

        boolean outdatedResult =
                metascanResultJsonTimestamp > resultFileTimestamp ||
                scancodeResultJsonTimestamp > resultFileTimestamp;

        final boolean overwrite = outdatedResult || getPropertyProvider().
                isProperty("analyze.scan.copyright.segmentation.overwrite", "true", "false");

        if (!overwrite && resultLogFile.exists()) {
            return false;
        }

        init(resultLogFile, "Copyright Segmentation");
        log(resultLogFile, "");

        // validate all input folders available
        FileUtils.validateExists(analysisDir);
        FileUtils.validateExists(metascanResultFile);
        FileUtils.validateExists(scancodeResultFile);

        String scancodeInput = new JSONObject(FileUtils.readFileToString(scancodeResultFile, StandardCharsets.UTF_8)).optJSONArray("headers").optJSONObject(0).optJSONObject("options").optJSONArray("input").optString(0);

        final ScanCodeParser scanCodeParser = new ScanCodeParser();

        final Map> licenseFileMap = new HashMap<>();
        final Map> copyrightFileMap = new HashMap<>();
        final Map> authorFileMap = new HashMap<>();

        final Map> metaLicenseFileMap = new SegmentationUtils().getLicenseFileMap(metascanResultFile, false);

        scanCodeParser.parseScanCodeResult(FileUtils.readFileToString(scancodeResultFile, FileUtils.ENCODING_UTF_8), licenseFileMap, copyrightFileMap, authorFileMap);

        Map> fileCopyrightMap = invertMap(copyrightFileMap);

        for (String license : metaLicenseFileMap.keySet()) {
            final List files = metaLicenseFileMap.get(license);

            boolean loggedLicense = false;

            // collect copyrights that are covered by the files
            final Set allCoveredCopyrights = new HashSet<>();
            for (final String file : files) {
                final String scancodePath = sourceDir.getName() + "-intermediate" + file + ".txt";
                final List copyrights = fileCopyrightMap.get(scancodePath);
                if (copyrights != null) {
                    allCoveredCopyrights.addAll(copyrights);
                }
            }

            final Map> copyrightCondensedCopyrightMap = new HashMap<>();
            for (String copyright : allCoveredCopyrights) {
                String condensed = copyright;
                condensed = condensed.replaceAll("[0-9]*", "");
                condensed = condensed.replaceAll("-", "");
                condensed = condensed.replaceAll(",", "");
                condensed = condensed.replace("Copyrighted ", "");
                condensed = condensed.replace("copyrighted ", "");
                condensed = condensed.replace("Copyright ", "");
                condensed = condensed.replace("copyright ", "");
                condensed = condensed.replace("(c)", "");
                condensed = condensed.replace("(C)", "");
                condensed = condensed.replace("©", "");
                condensed = condensed.trim();

                copyrightCondensedCopyrightMap.computeIfAbsent(condensed, k -> new ArrayList<>()).add(copyright);
            }

            for (String copyright : copyrightCondensedCopyrightMap.keySet()) {
                if (!loggedLicense) {
                    log(resultLogFile, ">>> " + license + " <<<");
                    loggedLicense = true;
                }
                log(resultLogFile, "  >> " + copyright + " <<");

                List condensedList = copyrightCondensedCopyrightMap.get(copyright);

                for (String cr : condensedList) {
                    log(resultLogFile, "    > " + cr);
                    List filesCovered = copyrightFileMap.get(cr);
                    for (String file : filesCovered) {
                        String path = FileUtils.asRelativePath(intermediateFolder, new File(analysisDir, file));
                        log(resultLogFile, "        " + scancodeInput + "/" + path);
                    }
                    log(resultLogFile, "");
                }
            }
        }
        matchScancodeResultToMetascanResult(artifact, sourceDir, scancodeInput);

        return true;
    }

    /**
     * Merges the scan results of the artifact and writes the three derived reports into the
     * {@code <sourceDir name>-analysis} folder next to {@code sourceDir}: the copyright/license assignment
     * file, the license variable extraction file and the ScanCode mismatch folder. A mismatch folder left over
     * from an earlier run is deleted first.
     *
     * @param artifact      the artifact whose id (slashes replaced by underscores) prefixes the output names
     * @param sourceDir     the scanned source folder
     * @param scanCodeInput prefix prepended to the file paths listed in the reports
     * @throws IOException if deleting the old mismatch folder or writing a report fails
     */
    private void matchScancodeResultToMetascanResult(Artifact artifact, File sourceDir, String scanCodeInput) throws IOException {
        // folder that receives all reports
        final File reportFolder = new File(sourceDir.getParentFile(), sourceDir.getName() + "-analysis");
        final String artifactPrefix = artifact.getId().replace("/", "_");

        // report locations
        final File assignmentReport = new File(reportFolder, artifactPrefix + "_copyright-license-assignment.txt");
        final File variablesReport = new File(reportFolder, artifactPrefix + "_license-variable-extractions.txt");
        final File mismatchFolder = new File(reportFolder, artifactPrefix + "-scancode-missmatch-issues");

        // start from a clean mismatch folder
        if (mismatchFolder.exists()) {
            FileUtils.deleteDir(mismatchFolder);
        }

        final MergedScanResult merged = new MergedScanResult();
        merged.mergeResults(artifact, sourceDir);

        copyrightLicenseAssignmentFile(merged, assignmentReport, scanCodeInput);
        variableExtractionFile(merged, variablesReport, scanCodeInput);
        metaScanScanCodeMissmatch(merged, mismatchFolder);
    }

    /**
     * Writes the copyright/license assignment report. Segments of the merged scan result are grouped by the
     * combination of resolved licenses, ScanCode licenses and copyrights; for every combination the report
     * lists the values followed by the paths of all segments showing that combination. Segments for which all
     * three joined values are {@code null} are left out.
     *
     * @param mergedResult                   the merged scan result to report on
     * @param copyrightLicenseAssignmentFile the report file to write
     * @param scancodeInput                  prefix prepended (with a "/") to every listed segment path
     * @throws IOException if writing the report fails
     */
    private void copyrightLicenseAssignmentFile(MergedScanResult mergedResult, File copyrightLicenseAssignmentFile, String scancodeInput) throws IOException {
        init(copyrightLicenseAssignmentFile, "Copyright/License Segmentation");
        log(copyrightLicenseAssignmentFile, "");

        final List<MergedSegmentResult> mergedScanResult = mergedResult.getMergedScanResult();
        if (mergedScanResult == null) {
            return;
        }

        // serialized result combination -> segment paths; insertion-ordered to keep the report order stable
        final Map<String, List<String>> groupedResult = new LinkedHashMap<>();
        for (final MergedSegmentResult mergedSegmentResult : mergedScanResult) {
            final String licenseResult = InventoryUtils.joinLicenses(mergedSegmentResult.getResolvedLicenses());
            final String scancodeLicensesResult = InventoryUtils.joinLicenses(mergedSegmentResult.getScancodeLicenses());
            final String copyrightsResults = InventoryUtils.joinLicenses(mergedSegmentResult.getCopyrights());

            if (licenseResult == null && scancodeLicensesResult == null && copyrightsResults == null) {
                continue;
            }

            // the JSON string serves as composite grouping key and is parsed again when the report is written
            final JSONObject json = new JSONObject();
            json.put("licenseResult", StringUtils.notNull(licenseResult));
            json.put("scancodeLicenseResult", StringUtils.notNull(scancodeLicensesResult));
            json.put("copyrightResult", StringUtils.notNull(copyrightsResults));

            groupedResult.computeIfAbsent(json.toString(), k -> new ArrayList<>())
                    .add(scancodeInput + "/" + mergedSegmentResult.getPath());
        }

        // FIXME: differentiate markers, filtered licenses, ignored licenses
        for (final Map.Entry<String, List<String>> group : groupedResult.entrySet()) {
            final JSONObject json = new JSONObject(group.getKey());
            log(copyrightLicenseAssignmentFile, "-----");
            log(copyrightLicenseAssignmentFile, "");
            log(copyrightLicenseAssignmentFile, "    resolvedLicenses:   " + json.getString("licenseResult"));
            log(copyrightLicenseAssignmentFile, "    scancodeLicenses:   " + json.getString("scancodeLicenseResult"));
            log(copyrightLicenseAssignmentFile, "    scancodeCopyrights: " + json.getString("copyrightResult"));
            log(copyrightLicenseAssignmentFile, "");
            log(copyrightLicenseAssignmentFile, "These results were found in the following files:");

            for (final String file : group.getValue()) {
                log(copyrightLicenseAssignmentFile, "  >" + file + "<");
            }
            log(copyrightLicenseAssignmentFile, "");
        }
    }

    /**
     * Writes the variable extraction report. The variable sets of all segments of the merged scan result are
     * collected; sets that {@code Variables.listContains} recognizes as already known are merged under the
     * known instance. The report is organized by license ({@code Variables.getLicense()}) and lists for every
     * variable set its values, the copyrights of the segments it was found in and the paths of these segments.
     *
     * @param mergedResult           the merged scan result to report on
     * @param variableExtractionFile the report file to write
     * @param scancodeInput          prefix prepended (with a "/") to every listed segment path
     * @throws IOException if writing the report fails
     */
    private void variableExtractionFile(MergedScanResult mergedResult, File variableExtractionFile, String scancodeInput) throws IOException {
        // insertion-ordered maps keep the report order stable between runs
        final Map<Variables, List<String>> variableFiles = new LinkedHashMap<>();
        final Map<Variables, List<String>> variableCopyrights = new LinkedHashMap<>();

        final List<MergedSegmentResult> mergedScanResult = mergedResult.getMergedScanResult();
        if (mergedScanResult != null) {
            for (final MergedSegmentResult mergedSegmentResult : mergedScanResult) {
                final List<Variables> variables = mergedSegmentResult.getVariables();
                final String filepath = mergedSegmentResult.getPath();
                if (variables == null || filepath == null) {
                    continue;
                }

                final String location = scancodeInput + "/" + filepath;
                final List<String> copyrights = mergedSegmentResult.getCopyrights();

                for (final Variables variable : variables) {
                    // snapshot of the known variable sets; taken once and used for both lookups
                    final List<Variables> knownVariables = new ArrayList<>(variableFiles.keySet());
                    final Variables key = variable.listContains(knownVariables)
                            ? variable.getFromList(knownVariables)
                            : variable;

                    variableFiles.computeIfAbsent(key, k -> new ArrayList<>()).add(location);

                    final List<String> collectedCopyrights =
                            variableCopyrights.computeIfAbsent(key, k -> new ArrayList<>());
                    // a segment may carry variables without any copyright
                    if (copyrights != null) {
                        collectedCopyrights.addAll(copyrights);
                    }
                }
            }
        }

        // license -> variable sets extracted for that license
        final Map<String, List<Variables>> licenseVariableList = new LinkedHashMap<>();
        for (final Variables variable : variableFiles.keySet()) {
            licenseVariableList.computeIfAbsent(variable.getLicense(), k -> new ArrayList<>()).add(variable);
        }

        init(variableExtractionFile, "Variable Extraction File");
        for (final Map.Entry<String, List<Variables>> licenseEntry : licenseVariableList.entrySet()) {
            final String license = licenseEntry.getKey();
            final List<Variables> variableSets = licenseEntry.getValue();

            log(variableExtractionFile, "-------------------");
            log(variableExtractionFile, license + "\n");
            if (variableSets.size() > 1) {
                log(variableExtractionFile, "The " + license + " has " + variableSets.size() + " different variable sets.\n");
            }
            for (final Variables variable : variableSets) {
                log(variableExtractionFile, "------");
                log(variableExtractionFile, "variables:");
                final Map<String, ?> values = variable.getValues();
                for (final Map.Entry<String, ?> value : values.entrySet()) {
                    log(variableExtractionFile, "  " + value.getKey() + ": " + value.getValue());
                }
                log(variableExtractionFile, "\nCopyrights associated with this license and variables:");
                for (final String s : variableCopyrights.get(variable)) {
                    log(variableExtractionFile, "  - " + s);
                }
                log(variableExtractionFile, "\nThose variables were found in the following File/Segments:");
                for (final String s : variableFiles.get(variable)) {
                    log(variableExtractionFile, "  - " + s);
                }
                log(variableExtractionFile, "\n");
            }
        }
    }

    /**
     * Writes one file per segment for which ScanCode reports license keys that the metascan result does not
     * cover. Segments without ScanCode license keys are skipped.
     * <ul>
     *   <li>If the segment has no resolved licenses, all ScanCode keys are listed.</li>
     *   <li>Otherwise the keys removed by {@code filterScancodeLicenseKeys}, the keys covered by the terms
     *       metadata of a resolved license and the keys with a value below 90 are dropped; a file is only
     *       written when keys remain.</li>
     * </ul>
     * The output file name is the segment path starting at its first "/" (the whole path if it has none).
     *
     * @param mergedResult            the merged scan result to inspect
     * @param scancodeMissmatchFolder the folder receiving the mismatch files
     * @throws IOException if writing a mismatch file fails
     */
    private void metaScanScanCodeMissmatch(MergedScanResult mergedResult, File scancodeMissmatchFolder) throws IOException {
        final List<MergedSegmentResult> mergedScanResult = mergedResult.getMergedScanResult();
        if (mergedScanResult == null) {
            return;
        }

        for (final MergedSegmentResult mergedSegmentResult : mergedScanResult) {
            // NOTE(review): the value type was lost in this view; the "< 90" comparison implies a numeric
            // type - presumably the ScanCode match score. Confirm against MergedSegmentResult.
            final Map<String, ? extends Number> scancodeLicenseKeysValue = mergedSegmentResult.getScancodeLicenseKeys();
            if (scancodeLicenseKeysValue == null || scancodeLicenseKeysValue.isEmpty()) {
                continue;
            }
            final List<String> scancodeLicenseKeys = new ArrayList<>(scancodeLicenseKeysValue.keySet());

            final String path = mergedSegmentResult.getPath();
            if (path == null) {
                continue;
            }
            // strip everything in front of the first "/"; keep the path as is when it has no "/"
            final int firstSeparator = path.indexOf('/');
            final String filename = firstSeparator >= 0 ? path.substring(firstSeparator) : path;

            final List<String> resolvedLicenses = mergedSegmentResult.getResolvedLicenses();
            if (resolvedLicenses == null || resolvedLicenses.isEmpty()) {
                final File output = new File(scancodeMissmatchFolder, filename);
                init(output, "MetaScan result empty, while ScanCode matches:");
                for (final String s : scancodeLicenseKeys) {
                    log(output, s);
                }
                continue;
            }

            filterScancodeLicenseKeys(scancodeLicenseKeys);

            // drop keys that are already explained by one of the resolved licenses
            scancodeLicenseKeys.removeIf(key -> isCoveredByResolvedLicenses(key, resolvedLicenses));

            // drop keys with a value below 90; keys without a value are kept so that they get reported
            scancodeLicenseKeys.removeIf(key -> {
                final Number value = scancodeLicenseKeysValue.get(key);
                return value != null && value.doubleValue() < 90;
            });

            if (!scancodeLicenseKeys.isEmpty()) {
                final File output = new File(scancodeMissmatchFolder, filename);
                init(output, "Additional ScanCode matches detected:");
                log(output, "MetaScan matches:" + org.apache.commons.lang3.StringUtils.join(resolvedLicenses, ", "));
                log(output, "Additional matches by ScanCode:");
                for (final String s : scancodeLicenseKeys) {
                    log(output, s);
                }
            }
        }
    }

    /**
     * Checks whether a ScanCode license key is listed (as {@code "scancode:<key>"}) in the other ids of one of
     * the resolved licenses. If the terms metadata of a resolved license has no other ids, the other ids of
     * the license it is represented as are consulted instead.
     *
     * @param scancodeLicenseKey the ScanCode license key to check
     * @param resolvedLicenses   the licenses resolved for the segment
     * @return {@code true} if at least one resolved license covers the key
     */
    private boolean isCoveredByResolvedLicenses(String scancodeLicenseKey, List<String> resolvedLicenses) {
        final NormalizationMetaData normalizationMetaData = getNormalizationMetaData();
        final String key = "scancode:" + scancodeLicenseKey;

        for (final String resolvedLicense : resolvedLicenses) {
            final TermsMetaData termsMetaData = normalizationMetaData.getTermsMetaData(resolvedLicense);
            if (termsMetaData == null) {
                continue;
            }

            List<String> otherIds = termsMetaData.getOtherIds();
            if (otherIds == null && termsMetaData.getRepresentedAs() != null) {
                // the lookup may fail for an unknown "represented as" license; treat that as not covered
                final TermsMetaData representedAs =
                        normalizationMetaData.getTermsMetaData(termsMetaData.getRepresentedAs());
                otherIds = representedAs != null ? representedAs.getOtherIds() : null;
            }

            if (otherIds != null && otherIds.contains(key)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Removes the generic ScanCode license keys ("other-permissive", "proprietary-license", "public-domain",
     * "unknown-license-reference", "unknown") from the given list. The list is modified in place.
     *
     * @param scancodeLicenseKeys the mutable list of ScanCode license keys to filter
     * @return the same list instance that was passed in
     */
    private List<String> filterScancodeLicenseKeys(List<String> scancodeLicenseKeys) {
        // removeAll drops every occurrence; List.remove(Object) would only drop the first one
        scancodeLicenseKeys.removeAll(Arrays.asList(
                "other-permissive", "proprietary-license", "public-domain", "unknown-license-reference", "unknown"));
        return scancodeLicenseKeys;
    }

    protected Map> invertMap(Map> keyListMap) {
        final Map> invertedMap = new HashMap<>();

        for (final Map.Entry> entry : keyListMap.entrySet()) {
            for (final String path : entry.getValue()) {
                List values = invertedMap.get(path);
                if (values == null) {
                    values = new ArrayList<>();
                    invertedMap.put(path, values);
                }
                if (!values.contains(entry.getKey())) {
                    values.add(entry.getKey());
                }
            }
        }
        return invertedMap;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy