All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.metaeffekt.artifact.analysis.spdxbom.LicenseStringConverter Maven / Gradle / Ivy

There is a newer version: 0.132.0
Show newest version
/*
 * Copyright 2021-2024 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.metaeffekt.artifact.analysis.spdxbom;

import com.metaeffekt.artifact.terms.model.NormalizationMetaData;
import com.metaeffekt.artifact.terms.model.TermsMetaData;
import lombok.Getter;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.metaeffekt.core.inventory.processor.model.Artifact;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.spdx.library.model.license.ListedLicenses;

import java.util.*;
import java.util.regex.Pattern;

/**
 * Contains methods to convert between license string and spdx expressions.
 */
public class LicenseStringConverter {
    private static final Logger LOG = LoggerFactory.getLogger(LicenseStringConverter.class);

    protected NormalizationMetaData normMeta;

    protected final Map spdxToLicenseStringAssessments;

    protected static final Pattern spdxOrPattern = Pattern.compile(" OR ", Pattern.CASE_INSENSITIVE);
    protected static final Pattern spdxAndPattern = Pattern.compile(" AND ", Pattern.CASE_INSENSITIVE);

    public static final int maxRecursionDepth = 64;

    private static final Pattern endsWithWithPattern = Pattern.compile("WITH *$");

    public static class ToSpdxResult {
        /**
         * The converted expression is the main result of conversion.
         */
        private final String convertedExpression;
        /**
         * A list of known license names for which a "LicenseRef" was added.
* These are returned so that classes using the converter may add the license texts to the spdx documents. * See {@link #referencedMissingTmd} for similar cases where tmd could not be found */ @Getter private final Set referencedLicenses; /** * Invalid license identifiers (as found).
* Not even tmd knew these identifiers. They therefore had to be added as LicenseRef-unknown-X. */ @Getter private final Set referencedMissingTmd; public ToSpdxResult( String convertedExpression, Set referencedLicenses, Set referencedMissingTmd) { this.convertedExpression = convertedExpression; this.referencedLicenses = referencedLicenses; this.referencedMissingTmd = referencedMissingTmd; } public String getExpression() { return convertedExpression; } } public LicenseStringConverter(NormalizationMetaData normalizationMetaData, Map spdxToLicenseStringAssessments) { Objects.requireNonNull(normalizationMetaData, "This class requires non-null NormalizationMetaData."); this.normMeta = normalizationMetaData; this.spdxToLicenseStringAssessments = spdxToLicenseStringAssessments == null ? new HashMap<>() : new HashMap<>(spdxToLicenseStringAssessments); } /** * Tries to find an assessment for the given expression.
* Useful if the expression can't be translated any other way.
* Note that these expressions are highly specific and don't respect logic equivalence. * * @param expression The expression to look for. * @return Returns a metaeffekt license string to replace the spdx expression. */ protected String findSpdxAssessment(String expression) { if (expression != null) { return spdxToLicenseStringAssessments.get(expression); } return null; } public ToSpdxResult licenseStringToSpdxExpression(String metaLicensesString) { // FIXME: why break logging by not providing an artifact? suggest better logging by removing this method? return licenseStringToSpdxExpression(metaLicensesString, new Artifact()); } protected TermsMetaData findTmdForCanonicalName(String inputName) { // resolve to valid tmd. trying to use curated spdx id or will use in licenseRef creation TermsMetaData resolvedMeta = normMeta.findTermsMetaData(inputName); // fallback to canonical name history if (resolvedMeta == null) { resolvedMeta = normMeta.findUsingCanonicalNameInHistory(inputName); } return resolvedMeta; } /** * Tries to create an spdx license expression from an input name.
* The input name is in tmd format and should be a canonicalName. * * @param inputName an ae input name, should be canonicalName. * @return returns a license expression that tries to represent the input name. */ protected String deriveSpdxRepresentation(final String inputName, Set referencedLicenses, Set referencedWithoutTmd) { TermsMetaData resolvedMeta = findTmdForCanonicalName(inputName); if (resolvedMeta == null) { // seems the system doesn't know tmd for this string // may be weird inputs from outdated input or declared licenses (e.g. "custom" in deb packages) LOG.warn("No terms metadata found for [{}].", inputName); referencedWithoutTmd.add(inputName); String tmdlessRef = LicenseStringUtils.createUnknownLicenseRef(inputName); LOG.debug("Created ref [{}] using raw name [{}].", tmdlessRef, inputName); return tmdlessRef; } return deriveLicenseExpressionFromTmd(resolvedMeta, referencedLicenses, referencedWithoutTmd); } /** * Converts an {@code artifact.getLicense()} string to an spdx expression (if possible) using TermsMetaData. * * @param metaLicensesString Input artifact license string. * @param artifact Corresponding artifact. Mostly used for more useful logging. * @return Returns the converted spdx expression. */ public ToSpdxResult licenseStringToSpdxExpression(String metaLicensesString, final Artifact artifact) { if (metaLicensesString == null) { return new ToSpdxResult(null, Collections.emptySet(), Collections.emptySet()); } else if (StringUtils.isBlank(metaLicensesString)) { return new ToSpdxResult("", Collections.emptySet(), Collections.emptySet()); } StringJoiner andJoiner = new StringJoiner(" AND "); String[] andSplit = metaLicensesString.split(","); // a list of license names that were converted to "LicenseRef" Set referencedLicenses = new HashSet<>(); Set referencedWithoutTmd = new HashSet<>(); for (String foundLicense : andSplit) { foundLicense = foundLicense.trim(); foundLicense = transformLicense(foundLicense); // split only if + is surrounded by spaces. just in case + is contained in a license name String[] licenseFound = foundLicense.trim().split(" \\+ "); StringJoiner orJoiner; if (licenseFound.length > 1 && andSplit.length > 1) { orJoiner = new StringJoiner(" OR ", "(", ")"); } else { orJoiner = new StringJoiner(" OR "); } // further splitting may be required with + (our OR) operator for (String licenseName : licenseFound) { // trim unneeded spaces licenseName = licenseName.trim(); if (StringUtils.isBlank(licenseName)) { // empty license string was found after splitting. this means that the input contains errors. throw new IllegalStateException("Invalid license string '" + metaLicensesString + "' (artifact '" + artifact.createStringRepresentation() + "') could not be parsed."); } if (licenseName.length() < 3) { LOG.info( "Suspicious input name [{}]. Bad license string [{}] in input inventory?", licenseName, metaLicensesString ); } String spdxRepresentation = deriveSpdxRepresentation(licenseName, referencedLicenses, referencedWithoutTmd); // bodges "... AND (exception id)" by fallback to ref, otherwise breaks spdx lib verification process if (ListedLicenses.getListedLicenses().isSpdxListedExceptionId(spdxRepresentation)) { TermsMetaData tmdToRef = findTmdForCanonicalName(licenseName); referencedLicenses.add(tmdToRef); spdxRepresentation = getLicenseRefForTmd(tmdToRef); } orJoiner.add(spdxRepresentation); } // sanity check generated text if (orJoiner.length() == 0) { throw new RuntimeException("Sanity check failed: Invalid empty orJoiner output for non-empty input."); } andJoiner.add(orJoiner.toString()); } return new ToSpdxResult(andJoiner.toString(), referencedLicenses, referencedWithoutTmd); } private String transformLicense(String foundLicense) { final TermsMetaData termsMetaData = normMeta.findUsingCanonicalNameInHistory(foundLicense); if (termsMetaData != null) return termsMetaData.getCanonicalName(); return foundLicense; } /** * Utility function specifically for spdxExpressionToLicenseString. * Recurses through all levels of parantheses and converts the lower levels. * * @param spdxExpressionSnippet The spdx expression that will be worked on. * @return A pair of the String with positional placeholders and values. Upon return, values have been processed. */ private Pair> recurseBrackets(String spdxExpressionSnippet, Artifact artifact) { // clean up empty brackets. those will then be used as placeholders for recursion. if (spdxExpressionSnippet.contains("()")) { LOG.warn("Spdx license expression contains '()' (empty brackets). This may lead to problems."); } List values = new ArrayList<>(); StringBuilder placeholderSnippet = new StringBuilder(); int added = 0; for (Map.Entry bracketPositions : LicenseStringUtils.findTopBrackets(spdxExpressionSnippet).entrySet()) { int contentBegin = bracketPositions.getKey(); int contentEnd = bracketPositions.getValue(); String sub = spdxExpressionSnippet.substring(contentBegin + 1, contentEnd); String convertedContent = spdxExpressionToLicenseString(sub, artifact); // update partially converted expression placeholderSnippet.append(spdxExpressionSnippet, added, contentBegin).append("()"); added = contentEnd + 1; values.add(convertedContent); } placeholderSnippet.append(spdxExpressionSnippet, added, spdxExpressionSnippet.length()); return Pair.of(placeholderSnippet.toString(), values); } public String spdxExpressionToLicenseString(String spdxExpression) { return spdxExpressionToLicenseString(spdxExpression, null); } /** * Rewrites an spdxExpression as an artifact "licenses" string. Some logic involving paranetheses may be lost. * * @param spdxExpression The expression to be converted. * @param artifact Corresponding artifact. Conversion errors may be recorded in its "Errors" field. * @return Returns the converted expression in a modified metaeffekt format. */ public String spdxExpressionToLicenseString(String spdxExpression, Artifact artifact) { if (spdxExpression == null) { return null; } else if (StringUtils.isBlank(spdxExpression)) { return ""; } // trim spdxExpression = spdxExpression.trim(); // manual assessments override all bit-by-bit conversion String lookupAssessment = findSpdxAssessment(spdxExpression); if (lookupAssessment != null) { return lookupAssessment; } // recurse through parantheses first Pair> recursionResult = recurseBrackets(spdxExpression, artifact); String expressionWithPlaceholders = recursionResult.getLeft(); List values = recursionResult.getRight(); // this string is now free of brackets (except for special "()" placeholders) // split according to spdx operator precedence, then join while respecting our (inverted) precedence rules String[] orParts = spdxOrPattern.split(expressionWithPlaceholders); StringJoiner orJoiner = new StringJoiner(" + "); for (String orPart : orParts) { String[] andParts = spdxAndPattern.split(orPart); StringJoiner andJoiner; // since our licenses string and spdx have differing order of precedence, we'll need brackets. // this is a modification of our usual rules, as usually brackets are forbidden. // insertion for placeholders relies on this detection being correct if (orParts.length > 1 && andParts.length > 1) { throw new RuntimeException("Logic incompatibility due to differing operator precedence: '" + orPart + "' in Expression '" + spdxExpression + "' would require bracketing. " + "Please provide a manual assessment for this exact expression."); } else { andJoiner = new StringJoiner(", "); } // final splitting, loop through license ids. for (String spdxPart : andParts) { // trim the license string String trimmed = spdxPart.trim(); // NOTE: a contained whitespace is not an indicator for an invalid spdx-identified. // The part may still contain expression parts. // handle the license id if (trimmed.equals("()")) { // handle placeholders left after recursing brackets // these will be literally inserted and brackets will be added if needed. String toInsert = values.remove(0); // enforce brackets if the next logic operator after brackets is "+" // they can be dropped otherwise due to the inherent properties of our operator precedence (+,). // they can also be dropped if in the brackets, or is the only immediately effective operator. if (LicenseStringUtils.containsTopAnd(toInsert) && andParts.length == 1 && orParts.length != 1) { // the license expression contains logic that can't be expressed in our system. // andJoiner.add(trimmed.replaceFirst(Pattern.quote("()"), "(" + toInsert + ")")); throw new RuntimeException("Cannot ignore brackets around '" + toInsert + "' inserted into '" + trimmed.replaceFirst(Pattern.quote("()"), "(INSERT_GOES_HERE)") + "'. " + "Logic may be incompatble between spdx and license strings."); } else { // we are able to simplify the bracketed expression in our logic system. simply insert. andJoiner.add(trimmed.replace("()", toInsert)); } } else if (trimmed.startsWith("LicenseRef-ae-")) { // TODO: support licenseRef of types other than "ae" in conversion back to our format // add LicenseRef result if it's valid String unwrappedLicenseRef = LicenseStringUtils.unwrapLicenseRef(trimmed); // validate found LicenseRefs for error reporting TermsMetaData foundMeta = normMeta.findByShortName(unwrappedLicenseRef); if (foundMeta == null) { // warn that we're generating invalid license names LOG.warn("Unknown license short name [{}].", unwrappedLicenseRef); addError(artifact, "No tmd '" + unwrappedLicenseRef + "'"); } andJoiner.add(unwrappedLicenseRef); } else { // this string part may be a regular Spdx license id. // check if we have any data on this license id TermsMetaData foundMeta = normMeta.findTermsMetaData(trimmed); if (foundMeta == null) { String mapped = spdxToLicenseStringAssessments.get(trimmed); if (mapped == null) { mapped = "SpdxLicense-(" + trimmed + ")"; LOG.warn("Can't find spdx expression '" + trimmed + "' in tmd. Writing '" + mapped + "'."); addError(artifact, "No data for spdxId '" + trimmed + "'"); } // forcing translation by generating an invalid name andJoiner.add(mapped); } else { andJoiner.add(foundMeta.getCanonicalName()); } } } orJoiner.add(andJoiner.toString()); } return orJoiner.toString(); } /** * Simple utility method that adds an error to an artifact.
* Useful if the inventory itself is written afterwards for error checking (rarely used). * * @param artifact the artifact to append an error string to * @param errorString the error string to append to the error field */ protected static void addError(Artifact artifact, String errorString) { if (artifact != null) { artifact.append("Errors", errorString, ", "); } } /** * Gets a representative name of a license for licenseRef generation. * * @param tmd the base data for name generation * @return the name string based on tmd */ public static String getMetaeffektRepresentativeName(TermsMetaData tmd) { // our license ids will use short names for better readability String name = tmd.getShortName(); // sanity check if (StringUtils.isBlank(name)) { LOG.warn( "Could not create meta representation for tmd [{}]. Indicates a processing error or invalid tmd.", tmd.getCanonicalName() ); } return name; } protected String parseProtoExpressionBottomLevel(String protoExpression, String canonicalName, Set referencedLicenses, Set referencedWithoutTmd, int recursionDepth) { // strip surrounding brackets and resolve string literal. String nameForPlaceholderBracket; if (protoExpression.startsWith("[") && protoExpression.endsWith("]")) { nameForPlaceholderBracket = protoExpression.substring(1, protoExpression.length() - 1); } else { nameForPlaceholderBracket = protoExpression; } // find tmd of this placeholder. we do a custom job here to better warn about outdated names TermsMetaData resolvedTmdForPlaceholder = normMeta.findTermsMetaData(nameForPlaceholderBracket); if (resolvedTmdForPlaceholder == null) { resolvedTmdForPlaceholder = normMeta.findUsingCanonicalNameInHistory(nameForPlaceholderBracket); if (resolvedTmdForPlaceholder != null) { // logging here since expressions should be squeaky clean but tmd doesn't test for this yet LOG.info( "Old canonicalName [{}] still used in expression of tmd [{}].", nameForPlaceholderBracket, canonicalName ); } } if (resolvedTmdForPlaceholder == null) { LOG.warn("No terms metadata found for [{}] referenced in spdxExpression of tmd [{}].", nameForPlaceholderBracket, canonicalName ); // we have no choice but to fail if we can't create a correct expression from the proto referencedWithoutTmd.add(nameForPlaceholderBracket); // abort and let the algorithm use another type of identifier return null; } return deriveLicenseExpressionFromTmd( resolvedTmdForPlaceholder, referencedLicenses, referencedWithoutTmd, recursionDepth + 1 ); } /** * Tries to replace all placeholder names in the proto expression.
* Square brackets are found recursively and replaced with tmd-compatible identifiers. * * @param protoExpression the expression construct from tmd * @param canonicalName name to produce better error logs if something goes wrong * @param referencedLicenses method will add to this modifiable list of referenced licenses * @param referencedWithoutTmd method will add to this modifiable list of licenses that don't have tmd * @param recursionDepth the amount of times this method has been called recursively to prevent infinite * loops. Should be 0 on first call, will stop when value reaches * {@link #maxRecursionDepth} * @return returns an spdx expression with identifiers inserted or null on (logic) error */ public String parseProtoExpression(String protoExpression, String canonicalName, Set referencedLicenses, Set referencedWithoutTmd, int recursionDepth) { if (protoExpression == null) { return null; } if (recursionDepth > maxRecursionDepth) { throw new RuntimeException("Recursion depth exceeded while parsing expression [{}] in [{}]."); } NavigableMap topBrackets = LicenseStringUtils.findTopBrackets(protoExpression, '[', ']'); if (topBrackets.size() <= 1) { return parseProtoExpressionBottomLevel( protoExpression, canonicalName, referencedLicenses, referencedWithoutTmd, recursionDepth + 1 ); } // otherwise continue to resolve brackets until we reach the bottom level of top brackets. StringBuilder outputExpression = new StringBuilder(); int lastEnd = 0; for (int start : topBrackets.keySet()) { outputExpression.append(protoExpression, lastEnd, start); int end = topBrackets.get(start); String topBracketContent = protoExpression.substring(start + 1, end); boolean isAfterWith = endsWithWithPattern.matcher(outputExpression.toString()).find(); String parsedSubstring = parseProtoExpression( topBracketContent, canonicalName, referencedLicenses, referencedWithoutTmd, recursionDepth + 1 ); if (parsedSubstring == null) { return null; } // FIXME: find a better way to deal with the issue of spdx not allowing licenseRef after WITH if (isAfterWith && parsedSubstring.startsWith("LicenseRef-")) { // doesn't allow license ref after WITH. just yeet null and use the combo identifier as a raw ref. return null; } outputExpression.append(parsedSubstring); // skip the bracket as well lastEnd = end + 1; } outputExpression.append(protoExpression, lastEnd, protoExpression.length()); return outputExpression.toString(); } /** * Represents a tmd as an spdx-compatible license expression. * * @param tmd the tmd to turn into a license expression * @param referencedLicenses modifiable set that keeps track of referenced licenses * @param referencedWithoutTmd modifiable set that keeps track of references without a valid tmd * @return returns a license expression * @see #deriveSpdxRepresentation */ public String deriveLicenseExpressionFromTmd(TermsMetaData tmd, Set referencedLicenses, Set referencedWithoutTmd) { return deriveLicenseExpressionFromTmd(tmd, referencedLicenses, referencedWithoutTmd, 0); } public String getLicenseRefForTmd(TermsMetaData tmd) { String expressionOrId = null; // FIXME: scancode fallback is causing a loss of information such as with: // - "scancode:bsd-original" // loss of information is what we were trying to minimize here, so we should change our logic! // intention is to fallback to public namespaces if possible // fallback scancode namespace String nullableScancodeId = tmd.getOtherId("scancode"); if (StringUtils.isNotBlank(nullableScancodeId)) { expressionOrId = LicenseStringUtils.createScancodeLicenseRef(nullableScancodeId); } // fallback metaeffekt namespace if (StringUtils.isBlank(expressionOrId)) { String representativeName = getMetaeffektRepresentativeName(tmd); if (StringUtils.isNotBlank(representativeName)) { expressionOrId = LicenseStringUtils.createAeLicenseRef(representativeName); } } // BODGE: fallback to spdx namespace licenseref if (StringUtils.isBlank(expressionOrId)) { // falling back to spdx identifier. this happens if an id exists but it's used in the wrong context like // "... AND (exception id)" or "... WITH licenseRef", since spdx 2.3 doesn't support those cases at all. // the current library also throws nasty errors if you try it anyway (but only at serialization). String spdxIdentifier = tmd.getSpdxIdentifier(); if (StringUtils.isNotBlank(spdxIdentifier)) { expressionOrId = LicenseStringUtils.createSpdxLicenseRef(spdxIdentifier); } } // sanity check: something is really broken if it's still empty, we shouldn't leave any cases like this if (StringUtils.isBlank(expressionOrId)) { LOG.error("Sanity check failed: will return empty license expression (should never happen)."); } // TODO: i was told that either shortName or spdxIdentifier need to exist? // otherwise we maybe should generate -unknown- licenseRef here, also, which might prevent errors return expressionOrId; } /** * Represents a tmd as an spdx-compatible license expression. * * @param tmd the tmd to turn into a license expression * @param referencedLicenses modifiable set that keeps track of referenced licenses * @param referencedWithoutTmd modifiable set that keeps track of references without a valid tmd * @param recursionDepth to prevent the horribly complex {@link #parseProtoExpression} methods from looping * @return returns a license expression * @see #deriveSpdxRepresentation */ public String deriveLicenseExpressionFromTmd( TermsMetaData tmd, Set referencedLicenses, Set referencedWithoutTmd, int recursionDepth ) { Objects.requireNonNull(tmd); // trace for debugging in case a tmd loops LOG.trace( "deriving spdx expression from tmd [{}] at recursion depth [{}].", tmd.getCanonicalName(), recursionDepth ); // first try to use a curated spdx value from tmd String expressionOrId = tmd.getSpdxIdentifier(); if (StringUtils.isNotBlank(expressionOrId) && !ListedLicenses.getListedLicenses().getSpdxListedLicenseIds().contains(expressionOrId) && !ListedLicenses.getListedLicenses().isSpdxListedExceptionId(expressionOrId)) { LOG.warn( "The supposed spdxIdentifier [{}] does not exist according to the SPDX library. Using other id.", expressionOrId ); expressionOrId = null; } if (StringUtils.isBlank(expressionOrId)) { if (tmd.getSpdxExpression() != null) { expressionOrId = parseProtoExpression( tmd.getSpdxExpression(), tmd.getCanonicalName(), referencedLicenses, referencedWithoutTmd, recursionDepth + 1 ); // returns null on error if (StringUtils.isNotBlank(expressionOrId)) { expressionOrId = "(" + expressionOrId + ")"; } } } if (StringUtils.isBlank(expressionOrId)) { expressionOrId = getLicenseRefForTmd(tmd); referencedLicenses.add(tmd); } return expressionOrId; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy