All downloads are free. Search and download functionality uses the official Maven repository.

com.metaeffekt.artifact.analysis.scancode.ScanCodeParser Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2021-2024 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.metaeffekt.artifact.analysis.scancode;

import com.metaeffekt.artifact.analysis.utils.FileUtils;
import com.metaeffekt.artifact.analysis.utils.StringUtils;
import com.metaeffekt.artifact.terms.model.TermsMetaData;
import org.apache.commons.lang3.tuple.Pair;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.AntPathMatcher;
import org.yaml.snakeyaml.Yaml;

import java.io.File;
import java.io.IOException;
import java.util.*;

public class ScanCodeParser {

    /** Logger shared by all parsing methods of this class. */
    public static final Logger LOG = LoggerFactory.getLogger(ScanCodeParser.class);

    /** Matcher used to evaluate Ant-style path patterns against scanned file paths. */
    private static final AntPathMatcher ANT_PATH_MATCHER = new AntPathMatcher();

    /**
     * Converts a ScanCode license/rule directory layout into per-license output folders.
     * <p>
     * For every license id found in {@code licenseSrcDir} the following is written below
     * {@code targetBaseDir/<first character of id>/<id>}:
     * <ul>
     *   <li>{@code license.meta.yaml} - the {@link TermsMetaData} derived from the license and its rules,</li>
     *   <li>{@code variants/<rule name>.txt} - the text of each rule of the license that is flagged as license text,</li>
     *   <li>{@code license/<id>.txt} - the trimmed license text.</li>
     * </ul>
     * License ids for which no metadata can be parsed are skipped.
     *
     * @param licenseSrcDir directory containing the {@code *.LICENSE} files.
     * @param rulesSrcDir   directory containing the {@code *.RULE} files.
     * @param targetBaseDir base directory the results are written to.
     * @throws IOException if a source file cannot be read or a target file cannot be written.
     */
    public void scanCodeToLicenseMetaData(File licenseSrcDir, File rulesSrcDir, File targetBaseDir) throws IOException {
        final List<String> licenseIds = parseLicenseIds(licenseSrcDir);

        // the rule directory is scanned only once; the result is filtered per license id below
        final String[] ruleYamlFiles = FileUtils.scanDirectoryForFiles(rulesSrcDir, "*.RULE");

        // a single parser instance is sufficient for all rule files
        final Yaml yaml = new Yaml();

        for (String licenseId : licenseIds) {
            final File licenseTargetDir = new File(new File(targetBaseDir, licenseId.substring(0, 1)), licenseId);

            // parse and write TermsMetaData
            final TermsMetaData termsMetaData = parseTermsMetaData(licenseId, licenseSrcDir, rulesSrcDir);
            if (termsMetaData == null) {
                // parseTermsMetaData already logged the reason; nothing can be written for this id
                continue;
            }
            final File targetFile = new File(licenseTargetDir, "license.meta.yaml");
            termsMetaData.writeToFile(targetFile);

            // copy variant license texts for this license id (we parse the rules a second time)
            for (String ruleFile : ruleYamlFiles) {
                if (!isRuleFileOfLicense(ruleFile, licenseId)) {
                    continue;
                }

                final File ruleYamlFile = new File(rulesSrcDir, ruleFile);
                final Pair<String, String> content = readScanCodeMixedFormat(ruleYamlFile);

                final ScanCodeRule scanCodeRule = yaml.loadAs(content.getLeft(), ScanCodeRule.class);
                final String rule = content.getRight();

                // loadAs yields null for an empty YAML header; such rules carry no flags
                if (scanCodeRule != null && scanCodeRule.isIs_license_text()) {
                    final File variantDst = new File(licenseTargetDir, "variants/" + ruleFile.replace(".RULE", ".txt"));
                    FileUtils.write(variantDst, rule, FileUtils.ENCODING_UTF_8);
                }
            }

            // copy license
            final File licenseFile = new File(licenseSrcDir, licenseId + ".LICENSE");
            final File targetLicenseFile = new File(licenseTargetDir, "license/" + licenseId + ".txt");
            if (licenseFile.exists()) {
                final Pair<String, String> content = readScanCodeMixedFormat(licenseFile);
                FileUtils.writeStringToFile(targetLicenseFile, content.getRight().trim(), FileUtils.ENCODING_UTF_8);
            }
            LOG.info("Parsing scancode license {} completed.", licenseId);
        }
    }

    /**
     * Checks whether a rule file name belongs to the given license id, i.e. whether it is named
     * {@code <licenseId>.RULE} or {@code <licenseId>_<anything>.RULE} (compared case-insensitively).
     *
     * @param ruleFile  the rule file name relative to the rules directory.
     * @param licenseId the license id the rule file is tested against.
     * @return {@code true} if the rule file belongs to the license id.
     */
    private static boolean isRuleFileOfLicense(String ruleFile, String licenseId) {
        if (ruleFile.equalsIgnoreCase(licenseId + ".RULE")) {
            return true;
        }
        final String prefix = licenseId + "_";
        final String suffix = ".RULE";
        return ruleFile.length() >= prefix.length() + suffix.length()
                && ruleFile.regionMatches(true, 0, prefix, 0, prefix.length())
                && ruleFile.regionMatches(true, ruleFile.length() - suffix.length(), suffix, 0, suffix.length());
    }

    /**
     * Reads a ScanCode file that consists of a YAML header enclosed in {@code ---} lines followed by
     * free text and splits it into these two parts.
     *
     * @param file the file to read (UTF-8).
     * @return a pair with the YAML header on the left and the trimmed remaining text on the right.
     * @throws IOException if the file cannot be read or does not contain the closing {@code ---} separator.
     */
    private Pair<String, String> readScanCodeMixedFormat(File file) throws IOException {
        final String fileContent = FileUtils.readFileToString(file, FileUtils.ENCODING_UTF_8);

        // drop the first "---" line, which opens the YAML header
        final String intermediate = fileContent.replaceFirst("---\\n", "");

        // the next "---" at the start of a line closes the YAML header
        final int index = intermediate.indexOf("\n---");
        if (index < 0) {
            throw new IOException("File [" + file + "] is not in the expected format; " +
                    "no '---' separator found after the YAML header.");
        }

        final String yamlPart = intermediate.substring(0, index);
        // skip the newline and the three dashes of the separator
        final String content = intermediate.substring(index + 4).trim();
        return Pair.of(yamlPart, content);
    }

    /**
     * Builds the {@link TermsMetaData} for a single ScanCode license from its {@code .LICENSE} file
     * and the rule files that belong to it.
     * <p>
     * The texts of all rules flagged as license reference are added as alternative names.
     *
     * @param licenseId      the ScanCode license id (the license file name without {@code .LICENSE}).
     * @param licenseBaseDir directory containing the {@code *.LICENSE} files.
     * @param rulesBaseDir   directory containing the {@code *.RULE} files.
     * @return the parsed metadata or {@code null} if the license file does not exist.
     * @throws IOException if the license file or a rule file cannot be read.
     */
    public TermsMetaData parseTermsMetaData(String licenseId, File licenseBaseDir, File rulesBaseDir) throws IOException {
        LOG.info("Parsing scancode license {}...", licenseId);

        final File licenseYamlFile = new File(licenseBaseDir, licenseId + ".LICENSE");

        if (!licenseYamlFile.exists()) {
            LOG.warn("Inconsistency in scancode versions. The license file [{}] could not be parsed. The file may not exist.", licenseYamlFile);
            return null;
        }

        final TermsMetaData termsMetaData = new TermsMetaData();
        final Yaml yaml = new Yaml();

        final Pair<String, String> licenseContent = readScanCodeMixedFormat(licenseYamlFile);

        final ScanCodeLicense scanCodeLicense = yaml.loadAs(licenseContent.getLeft(), ScanCodeLicense.class);

        termsMetaData.setCanonicalName(scanCodeLicense.getName());
        // NOTE(review): the category is populated with the license name, not with getCategory() - confirm this is intended
        termsMetaData.setCategory(scanCodeLicense.getName());
        termsMetaData.setSpdxIdentifier(scanCodeLicense.getSpdx_license_key());
        termsMetaData.addOtherId("scancode", licenseId);
        termsMetaData.setShortName(licenseId);
        termsMetaData.setType(scanCodeLicense.isIs_exception() ? "exception" : null);
        termsMetaData.setUrl(scanCodeLicense.getHomepage_url());
        termsMetaData.setClassification(mapCategoryToClassification(scanCodeLicense.getCategory()));
        if (scanCodeLicense.getOsi_license_key() != null) {
            termsMetaData.addOtherId("osi", scanCodeLicense.getOsi_license_key());
        }

        // add the rules: the base rule <id>.RULE and all variants <id>_*.RULE
        final String[] ruleYamlFiles = FileUtils.scanDirectoryForFiles(rulesBaseDir, licenseId + ".RULE", licenseId + "_*.RULE");
        for (String ruleFile : ruleYamlFiles) {
            final File ruleYamlFile = new File(rulesBaseDir, ruleFile);

            final Pair<String, String> ruleContent = readScanCodeMixedFormat(ruleYamlFile);

            final ScanCodeRule scanCodeRule = yaml.loadAs(ruleContent.getLeft(), ScanCodeRule.class);
            if (scanCodeRule.getIs_license_reference() != null && scanCodeRule.getIs_license_reference().startsWith("yes")) {
                // collapse the reference text to a single line and escape double quotes
                String ref = ruleContent.getRight().replace("\n", " ").trim();
                ref = ref.replace("\"", "\\\"");
                termsMetaData.getAlternativeNames().add(ref);
            }
        }

        termsMetaData.consolidateAlternativeNames();
        return termsMetaData;
    }

    /**
     * Translates a ScanCode license category into the classification term used in the metadata.
     * The comparison ignores case. Unknown categories are logged and returned unchanged.
     *
     * @param category the ScanCode category; may be {@code null}.
     * @return the mapped classification or the given category if no mapping exists.
     */
    private String mapCategoryToClassification(String category) {
        // each entry: { ScanCode category, classification }; evaluated in declaration order
        final String[][] mappings = {
                {"permissive", "permissive"},
                {"copyleft", "copyleft"},
                {"copyleft limited", "limited copyleft"},
                {"Proprietary Free", "proprietary free"},
                {"commercial", "commercial"},
                {"Free Restricted", "restricted free"},
                {"Patent License", "patent license"},
                {"Public Domain", "public domain"},
                {"Source-available", "source-available"},
        };

        for (final String[] mapping : mappings) {
            if (mapping[0].equalsIgnoreCase(category)) {
                return mapping[1];
            }
        }

        LOG.warn("Cannot map unknown category: {}", category);
        return category;
    }

    /**
     * Collects the license ids available in the given directory. A license id is the path of a
     * {@code *.LICENSE} file (as reported by the directory scan) without the {@code .LICENSE} suffix.
     *
     * @param licenseSrcDir directory that is scanned (including subdirectories) for {@code *.LICENSE} files.
     * @return the list of license ids in scan order; never {@code null}.
     */
    protected List<String> parseLicenseIds(File licenseSrcDir) {
        final String[] licenseYamlFiles = FileUtils.scanDirectoryForFiles(licenseSrcDir, "**/*.LICENSE");
        final List<String> licenseIds = new ArrayList<>(licenseYamlFiles.length);
        for (String licenseYamlFile : licenseYamlFiles) {
            // strip the file extension to obtain the id
            licenseIds.add(licenseYamlFile.substring(0, licenseYamlFile.lastIndexOf(".LICENSE")));
        }
        return licenseIds;
    }

    /**
     * Parses the JSON output of scancode and aggregates the licenses, copyrights and authors of all
     * files that match at least one include pattern and none of the exclude patterns.
     *
     * @param result        String with output content.
     * @param licenseList   The set the detected licenses are added to.
     * @param copyrightList The set the detected copyrights are added to.
     * @param authorList    The set the detected authors are added to.
     * @param includes      The Ant-style path patterns to include.
     * @param excludes      The Ant-style path patterns to exclude.
     * @throws JSONException If the result JSON string is invalid.
     */
    public static void parseScanCodeResult(final String result, TreeSet<String> licenseList, TreeSet<String> copyrightList, TreeSet<String> authorList, String[] includes, String[] excludes) throws JSONException {

        // NOTE:
        // - the scan code result is filtered using includes and excludes patterns
        // - per default the analyze.metascan.license.includes and analyze.metascan.license.excludes are used to
        //   conform with the settings of Metascan.

        final JSONObject obj = new JSONObject(result);
        final JSONArray files = obj.getJSONArray("files");
        for (int i = 0; i < files.length(); i++) {
            final JSONObject file = files.getJSONObject(i);

            final String path = file.getString("path");

            // skip if not included
            if (!matchesAnyPattern(includes, path)) continue;

            // skip if excluded; every exclude pattern is evaluated until one matches
            if (matchesAnyPattern(excludes, path)) continue;

            final List<String> licenses = parseLicenses(file);
            licenseList.addAll(licenses);

            final JSONArray copyrights = file.getJSONArray("copyrights");
            addValuesToList(copyrights, copyrightList, "copyright");

            final JSONArray authors = file.getJSONArray("authors");
            addValuesToList(authors, authorList, "author");
        }
    }

    /**
     * Checks whether the path matches at least one of the given Ant-style patterns.
     *
     * @param patterns the patterns to test.
     * @param path     the path to match.
     * @return {@code true} as soon as one pattern matches; {@code false} if none matches.
     */
    private static boolean matchesAnyPattern(String[] patterns, String path) {
        for (String pattern : patterns) {
            if (ANT_PATH_MATCHER.match(pattern, path)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Extracts the license identifiers reported for a single file entry of the scancode result.
     * <p>
     * Two layouts are supported: entries with a {@code licenses} array (each element providing a
     * {@code key}) and entries with a {@code license_detections} array (each element providing a
     * {@code license_expression}). Values are split at {@code " AND "} into individual parts.
     *
     * @param file the JSON object of one file from the {@code files} array.
     * @return the list of license identifiers; may contain duplicates.
     */
    private static List<String> parseLicenses(JSONObject file) {
        final List<String> aggregatedLicenseExpressions = new ArrayList<>();

        // pre 32 versions report "licenses" with a "key"; otherwise "license_detections" with a "license_expression"
        final boolean legacyFormat = file.has("licenses");
        final JSONArray entries = file.getJSONArray(legacyFormat ? "licenses" : "license_detections");
        final String field = legacyFormat ? "key" : "license_expression";

        for (int i = 0; i < entries.length(); i++) {
            final String expression = entries.getJSONObject(i).getString(field);

            // for the legacy "key" the split is not strictly necessary; it is applied as a precaution
            aggregatedLicenseExpressions.addAll(Arrays.asList(expression.split(" AND ")));
        }
        return aggregatedLicenseExpressions;
    }

    /**
     * Adds the non-empty string values stored under {@code key} in each object of the given JSON
     * array to the target set.
     *
     * @param copyrights    the JSON array of objects to read from (copyright or author entries).
     * @param copyrightList the set the values are added to.
     * @param key           the attribute name to read from each object.
     * @throws JSONException if an array element is not a JSON object or the value is not a string.
     */
    private static void addValuesToList(JSONArray copyrights, TreeSet<String> copyrightList, String key) throws JSONException {
        for (int j = 0; j < copyrights.length(); j++) {
            final JSONObject entry = copyrights.getJSONObject(j);
            final String value = getFieldStringValue(entry, key);
            if (!StringUtils.isEmpty(value)) {
                copyrightList.add(value);
            }
        }
    }

    public void parseScanCodeResult(String result, Map> licenseFileMap, Map> copyrightFileMap, Map> authorFileMap) throws JSONException {
        JSONObject obj = new JSONObject(result);
        JSONArray files = obj.getJSONArray("files");
        for (int i = 0; i < files.length(); i++) {
            JSONObject file = files.getJSONObject(i);

            final JSONArray copyrights = file.getJSONArray("copyrights");
            final String path = file.getString("path");
            for (int j = 0; j < copyrights.length(); j++) {
                JSONObject copyright = copyrights.getJSONObject(j);
                String value = getFieldStringValue(copyright, "copyright");
                List list = copyrightFileMap.get(value);
                if (list == null) {
                    list = new ArrayList<>();
                    copyrightFileMap.put(value, list);
                }
                if (!list.contains(path)) {
                    list.add(path);
                }
            }

            JSONArray authors = file.getJSONArray("authors");
            for (int j = 0; j < authors.length(); j++) {
                JSONObject author = authors.getJSONObject(j);
                String value = getFieldStringValue(author, "author");
                List list = authorFileMap.get(value);
                if (list == null) {
                    list = new ArrayList<>();
                    authorFileMap.put(value, list);
                }
                if (!list.contains(path)) {
                    list.add(path);
                }
            }

            List licenses = parseLicenses(file);
            for (String license : licenses) {
                final List list = licenseFileMap.computeIfAbsent(license, k -> new ArrayList<>());
                if (!list.contains(path)) {
                    list.add(path);
                }
            }
        }
    }

    /**
     * Reads the string value stored under the given key.
     *
     * @param object the JSON object to read from.
     * @param key    the attribute name.
     * @return the string value of the attribute.
     * @throws JSONException         if the attribute exists but is not a string.
     * @throws IllegalStateException if the attribute does not exist (old, unsupported result format).
     */
    private static String getFieldStringValue(JSONObject object, String key) throws JSONException {
        if (!object.has(key)) {
            // old format; no longer supported
            throw new IllegalStateException("Unsupported scancode result format: attribute '" + key + "' is missing.");
        }
        return object.getString(key);
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy