// com.metaeffekt.artifact.analysis.spdxbom.LicenseStringConverter (Maven / Gradle / Ivy)
/*
* Copyright 2021-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.metaeffekt.artifact.analysis.spdxbom;
import com.metaeffekt.artifact.terms.model.NormalizationMetaData;
import com.metaeffekt.artifact.terms.model.TermsMetaData;
import lombok.Getter;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.metaeffekt.core.inventory.processor.model.Artifact;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.spdx.library.model.license.ListedLicenses;
import java.util.*;
import java.util.regex.Pattern;
/**
* Contains methods to convert between license string and spdx expressions.
*/
public class LicenseStringConverter {
private static final Logger LOG = LoggerFactory.getLogger(LicenseStringConverter.class);

/**
 * Source of terms metadata (tmd) used for all license name lookups. The constructor rejects null.
 */
protected NormalizationMetaData normMeta;

/**
 * Manual assessments: maps an exact spdx expression (or single identifier) to the license string that
 * replaces it. Holds a private copy of the map handed to the constructor.
 */
protected final Map<String, String> spdxToLicenseStringAssessments;

/**
 * Matches the spdx OR operator (with one space on each side), ignoring case.
 */
protected static final Pattern spdxOrPattern = Pattern.compile(" OR ", Pattern.CASE_INSENSITIVE);

/**
 * Matches the spdx AND operator (with one space on each side), ignoring case.
 */
protected static final Pattern spdxAndPattern = Pattern.compile(" AND ", Pattern.CASE_INSENSITIVE);

/**
 * Upper bound for the recursion depth counter checked while parsing proto expressions.
 */
public static final int maxRecursionDepth = 64;

/**
 * Matches text that ends with the (upper case) keyword WITH, optionally followed by spaces.
 */
private static final Pattern endsWithWithPattern = Pattern.compile("WITH *$");
/**
 * Result of converting a license string to an spdx expression: the expression itself plus the bookkeeping
 * sets that were filled during conversion.
 */
public static class ToSpdxResult {

    /**
     * The converted expression is the main result of conversion.
     */
    private final String convertedExpression;

    /**
     * Terms metadata of known licenses for which a "LicenseRef" was added.
     * These are returned so that classes using the converter may add the license texts to the spdx documents.
     * See {@link #referencedMissingTmd} for similar cases where tmd could not be found.
     */
    @Getter
    private final Set<TermsMetaData> referencedLicenses;

    /**
     * Invalid license identifiers (as found).
     * Not even tmd knew these identifiers. They therefore had to be added as LicenseRef-unknown-X.
     */
    @Getter
    private final Set<String> referencedMissingTmd;

    /**
     * Creates a result holder. The given sets are stored as passed (no defensive copy).
     *
     * @param convertedExpression  the resulting spdx expression; may be null or empty
     * @param referencedLicenses   tmd of licenses that were represented by a "LicenseRef"
     * @param referencedMissingTmd raw identifiers for which no tmd could be found
     */
    public ToSpdxResult(
            String convertedExpression,
            Set<TermsMetaData> referencedLicenses,
            Set<String> referencedMissingTmd) {
        this.convertedExpression = convertedExpression;
        this.referencedLicenses = referencedLicenses;
        this.referencedMissingTmd = referencedMissingTmd;
    }

    /**
     * @return the converted spdx expression
     */
    public String getExpression() {
        return convertedExpression;
    }
}
/**
 * Creates a converter backed by the given normalization metadata.
 *
 * @param normalizationMetaData          the tmd source used for lookups; must not be null
 * @param spdxToLicenseStringAssessments manual assessments mapping spdx expressions to license strings;
 *                                       copied on construction, null is treated as "no assessments"
 * @throws NullPointerException if {@code normalizationMetaData} is null
 */
public LicenseStringConverter(NormalizationMetaData normalizationMetaData,
                              Map<String, String> spdxToLicenseStringAssessments) {
    this.normMeta = Objects.requireNonNull(
            normalizationMetaData, "This class requires non-null NormalizationMetaData.");

    // keep a private copy so later changes to the caller's map don't affect this converter
    this.spdxToLicenseStringAssessments = spdxToLicenseStringAssessments == null
            ? new HashMap<>()
            : new HashMap<>(spdxToLicenseStringAssessments);
}
/**
 * Looks up a manual assessment for exactly the given expression.
 * This is the last resort for expressions that can't be translated piece by piece.
 * The lookup is a plain key match: logically equivalent but differently written expressions won't be found.
 *
 * @param expression The expression to look for; null yields null.
 * @return Returns the metaeffekt license string replacing the spdx expression, or null if none is registered.
 */
protected String findSpdxAssessment(String expression) {
    return expression == null ? null : spdxToLicenseStringAssessments.get(expression);
}
/**
 * Converts a license string to an spdx expression without a real artifact at hand.
 * Delegates to the two-argument variant, handing it a freshly created, empty artifact.
 *
 * @param metaLicensesString Input license string.
 * @return Returns the conversion result.
 */
public ToSpdxResult licenseStringToSpdxExpression(String metaLicensesString) {
    // FIXME: why break logging by not providing an artifact? suggest better logging by removing this method?
    final Artifact placeholderArtifact = new Artifact();
    return licenseStringToSpdxExpression(metaLicensesString, placeholderArtifact);
}
/**
 * Resolves terms metadata for a name: first by direct lookup, then via the canonical name history.
 *
 * @param inputName the name to resolve
 * @return the resolved tmd, or whatever the history lookup yields if the direct lookup found nothing
 */
protected TermsMetaData findTmdForCanonicalName(String inputName) {
    // direct hit: the result is used for its curated spdx id or for licenseRef creation
    final TermsMetaData directMatch = normMeta.findTermsMetaData(inputName);
    if (directMatch != null) {
        return directMatch;
    }

    // no direct hit, the name may be an outdated canonical name
    return normMeta.findUsingCanonicalNameInHistory(inputName);
}
/**
 * Tries to create an spdx license expression from an input name.
 * The input name is in tmd format and should be a canonicalName.
 * <p>
 * If no tmd can be resolved for the name, the name is recorded in {@code referencedWithoutTmd} and an
 * "unknown" license ref built from the raw name is returned instead.
 *
 * @param inputName            an ae input name, should be canonicalName.
 * @param referencedLicenses   modifiable set that collects tmd of licenses represented by a license ref
 * @param referencedWithoutTmd modifiable set that collects names for which no tmd could be found
 * @return returns a license expression that tries to represent the input name.
 */
protected String deriveSpdxRepresentation(final String inputName,
                                          Set<TermsMetaData> referencedLicenses,
                                          Set<String> referencedWithoutTmd) {
    final TermsMetaData resolvedMeta = findTmdForCanonicalName(inputName);

    if (resolvedMeta != null) {
        return deriveLicenseExpressionFromTmd(resolvedMeta, referencedLicenses, referencedWithoutTmd);
    }

    // seems the system doesn't know tmd for this string
    // may be weird inputs from outdated input or declared licenses (e.g. "custom" in deb packages)
    LOG.warn("No terms metadata found for [{}].", inputName);
    referencedWithoutTmd.add(inputName);

    final String tmdlessRef = LicenseStringUtils.createUnknownLicenseRef(inputName);
    LOG.debug("Created ref [{}] using raw name [{}].", tmdlessRef, inputName);
    return tmdlessRef;
}
/**
 * Converts an {@code artifact.getLicense()} string to an spdx expression (if possible) using TermsMetaData.
 * <p>
 * The input is split at "," (our AND) and each part at " + " (our OR). Every resulting name is converted
 * individually; OR groups are wrapped in parentheses when they are combined with other parts via AND.
 *
 * @param metaLicensesString Input artifact license string. Null yields a null expression, a blank string
 *                           yields an empty expression.
 * @param artifact           Corresponding artifact. Only used to produce a more useful error message;
 *                           may be null.
 * @return Returns the converted spdx expression together with the referenced licenses.
 * @throws IllegalStateException if splitting produced an empty license name, meaning the input is malformed
 */
public ToSpdxResult licenseStringToSpdxExpression(String metaLicensesString, final Artifact artifact) {
    if (metaLicensesString == null) {
        return new ToSpdxResult(null, Collections.emptySet(), Collections.emptySet());
    } else if (StringUtils.isBlank(metaLicensesString)) {
        return new ToSpdxResult("", Collections.emptySet(), Collections.emptySet());
    }

    final StringJoiner andJoiner = new StringJoiner(" AND ");
    final String[] andSplit = metaLicensesString.split(",");

    // tmd of licenses that were converted to "LicenseRef"
    final Set<TermsMetaData> referencedLicenses = new HashSet<>();
    // raw names for which no tmd exists
    final Set<String> referencedWithoutTmd = new HashSet<>();

    for (String rawPart : andSplit) {
        // map outdated canonical names to their current name before splitting any further
        final String foundLicense = transformLicense(rawPart.trim());

        // split only if + is surrounded by spaces. just in case + is contained in a license name
        final String[] licenseFound = foundLicense.trim().split(" \\+ ");

        // an OR group needs parentheses only if it is going to be combined with other parts via AND
        final StringJoiner orJoiner;
        if (licenseFound.length > 1 && andSplit.length > 1) {
            orJoiner = new StringJoiner(" OR ", "(", ")");
        } else {
            orJoiner = new StringJoiner(" OR ");
        }

        // further splitting may be required with + (our OR) operator
        for (String untrimmedName : licenseFound) {
            final String licenseName = untrimmedName.trim();

            if (StringUtils.isBlank(licenseName)) {
                // empty license string was found after splitting. this means that the input contains errors.
                // the artifact is optional context; don't let a missing one mask the actual parse error.
                final String artifactRepresentation =
                        artifact == null ? "<unknown>" : artifact.createStringRepresentation();
                throw new IllegalStateException("Invalid license string '" + metaLicensesString
                        + "' (artifact '" + artifactRepresentation + "') could not be parsed.");
            }

            if (licenseName.length() < 3) {
                LOG.info(
                        "Suspicious input name [{}]. Bad license string [{}] in input inventory?",
                        licenseName,
                        metaLicensesString
                );
            }

            String spdxRepresentation =
                    deriveSpdxRepresentation(licenseName, referencedLicenses, referencedWithoutTmd);

            // bodges "... AND (exception id)" by fallback to ref, otherwise breaks spdx lib verification process
            if (ListedLicenses.getListedLicenses().isSpdxListedExceptionId(spdxRepresentation)) {
                final TermsMetaData tmdToRef = findTmdForCanonicalName(licenseName);
                referencedLicenses.add(tmdToRef);
                spdxRepresentation = getLicenseRefForTmd(tmdToRef);
            }

            orJoiner.add(spdxRepresentation);
        }

        // sanity check generated text
        if (orJoiner.length() == 0) {
            throw new RuntimeException("Sanity check failed: Invalid empty orJoiner output for non-empty input.");
        }

        andJoiner.add(orJoiner.toString());
    }

    return new ToSpdxResult(andJoiner.toString(), referencedLicenses, referencedWithoutTmd);
}
/**
 * Replaces a license name by the canonical name of the tmd found via the canonical name history.
 *
 * @param foundLicense the license name as found in the license string
 * @return the canonical name of the matching tmd, or the unchanged input if the history lookup finds nothing
 */
private String transformLicense(String foundLicense) {
    final TermsMetaData historyMatch = normMeta.findUsingCanonicalNameInHistory(foundLicense);
    return historyMatch == null ? foundLicense : historyMatch.getCanonicalName();
}
/**
* Utility function specifically for spdxExpressionToLicenseString.
* Recurses through all levels of parantheses and converts the lower levels.
*
* @param spdxExpressionSnippet The spdx expression that will be worked on.
* @return A pair of the String with positional placeholders and values. Upon return, values have been processed.
*/
private Pair> recurseBrackets(String spdxExpressionSnippet, Artifact artifact) {
// clean up empty brackets. those will then be used as placeholders for recursion.
if (spdxExpressionSnippet.contains("()")) {
LOG.warn("Spdx license expression contains '()' (empty brackets). This may lead to problems.");
}
List values = new ArrayList<>();
StringBuilder placeholderSnippet = new StringBuilder();
int added = 0;
for (Map.Entry bracketPositions : LicenseStringUtils.findTopBrackets(spdxExpressionSnippet).entrySet()) {
int contentBegin = bracketPositions.getKey();
int contentEnd = bracketPositions.getValue();
String sub = spdxExpressionSnippet.substring(contentBegin + 1, contentEnd);
String convertedContent = spdxExpressionToLicenseString(sub, artifact);
// update partially converted expression
placeholderSnippet.append(spdxExpressionSnippet, added, contentBegin).append("()");
added = contentEnd + 1;
values.add(convertedContent);
}
placeholderSnippet.append(spdxExpressionSnippet, added, spdxExpressionSnippet.length());
return Pair.of(placeholderSnippet.toString(), values);
}
/**
 * Rewrites an spdxExpression as an artifact "licenses" string without an artifact to record errors in.
 * Delegates to the two-argument variant with a null artifact.
 *
 * @param spdxExpression The expression to be converted.
 * @return Returns the converted expression in a modified metaeffekt format.
 */
public String spdxExpressionToLicenseString(String spdxExpression) {
    final Artifact noArtifact = null;
    return spdxExpressionToLicenseString(spdxExpression, noArtifact);
}
/**
 * Rewrites an spdxExpression as an artifact "licenses" string. Some logic involving parentheses may be lost.
 * <p>
 * A manual assessment registered for the exact (trimmed) expression takes precedence over everything else.
 * Otherwise bracketed sub-expressions are converted recursively, then the remainder is split at OR and AND
 * and rejoined with " + " and ", " respectively.
 *
 * @param spdxExpression The expression to be converted. Null yields null, a blank string yields "".
 * @param artifact       Corresponding artifact. Conversion errors may be recorded in its "Errors" field.
 *                       May be null.
 * @return Returns the converted expression in a modified metaeffekt format.
 * @throws RuntimeException if the expression's logic can't be expressed with the license string operator
 *                          precedence; a manual assessment is required in that case
 */
public String spdxExpressionToLicenseString(String spdxExpression, Artifact artifact) {
    if (spdxExpression == null) {
        return null;
    } else if (StringUtils.isBlank(spdxExpression)) {
        return "";
    }

    spdxExpression = spdxExpression.trim();

    // manual assessments override all bit-by-bit conversion
    final String lookupAssessment = findSpdxAssessment(spdxExpression);
    if (lookupAssessment != null) {
        return lookupAssessment;
    }

    // recurse through parentheses first
    final Pair<String, List<String>> recursionResult = recurseBrackets(spdxExpression, artifact);
    final String expressionWithPlaceholders = recursionResult.getLeft();
    final List<String> values = recursionResult.getRight();

    // this string is now free of brackets (except for special "()" placeholders)
    // split according to spdx operator precedence, then join while respecting our (inverted) precedence rules
    final String[] orParts = spdxOrPattern.split(expressionWithPlaceholders);
    final StringJoiner orJoiner = new StringJoiner(" + ");

    for (String orPart : orParts) {
        final String[] andParts = spdxAndPattern.split(orPart);

        // since our licenses string and spdx have differing order of precedence, an AND below an OR would
        // need brackets, which our format doesn't have.
        // insertion for placeholders relies on this detection being correct
        if (orParts.length > 1 && andParts.length > 1) {
            throw new RuntimeException("Logic incompatibility due to differing operator precedence: '"
                    + orPart + "' in Expression '" + spdxExpression + "' would require bracketing. "
                    + "Please provide a manual assessment for this exact expression.");
        }
        final StringJoiner andJoiner = new StringJoiner(", ");

        // final splitting, loop through license ids.
        for (String spdxPart : andParts) {
            final String trimmed = spdxPart.trim();

            // NOTE: a contained whitespace is not an indicator for an invalid spdx-identifier.
            //  The part may still contain expression parts.
            // NOTE(review): a placeholder is only consumed if the part is exactly "()". A part that merely
            //  contains one (e.g. "X WITH ()") falls through to the id lookup below and leaves its value in
            //  the list, which would shift all later placeholders - confirm whether such input can occur.

            if (trimmed.equals("()")) {
                // handle placeholders left after recursing brackets; values are ordered by position
                final String toInsert = values.remove(0);

                // enforce brackets if the next logic operator after brackets is "+"
                // they can be dropped otherwise due to the inherent properties of our operator precedence (+,).
                // they can also be dropped if in the brackets, or is the only immediately effective operator.
                if (LicenseStringUtils.containsTopAnd(toInsert) && andParts.length == 1 && orParts.length != 1) {
                    // the license expression contains logic that can't be expressed in our system.
                    throw new RuntimeException("Cannot ignore brackets around '" + toInsert
                            + "' inserted into '(INSERT_GOES_HERE)'. "
                            + "Logic may be incompatble between spdx and license strings.");
                }

                // we are able to simplify the bracketed expression in our logic system. simply insert.
                andJoiner.add(toInsert);
            } else if (trimmed.startsWith("LicenseRef-ae-")) {
                // TODO: support licenseRef of types other than "ae" in conversion back to our format
                final String unwrappedLicenseRef = LicenseStringUtils.unwrapLicenseRef(trimmed);

                // validate found LicenseRefs for error reporting; the name is added either way
                final TermsMetaData foundMeta = normMeta.findByShortName(unwrappedLicenseRef);
                if (foundMeta == null) {
                    // warn that we're generating invalid license names
                    LOG.warn("Unknown license short name [{}].", unwrappedLicenseRef);
                    addError(artifact, "No tmd '" + unwrappedLicenseRef + "'");
                }

                andJoiner.add(unwrappedLicenseRef);
            } else {
                // this string part may be a regular Spdx license id.
                // check if we have any data on this license id
                final TermsMetaData foundMeta = normMeta.findTermsMetaData(trimmed);
                if (foundMeta != null) {
                    andJoiner.add(foundMeta.getCanonicalName());
                } else {
                    String mapped = spdxToLicenseStringAssessments.get(trimmed);
                    if (mapped == null) {
                        // forcing translation by generating an invalid name
                        mapped = "SpdxLicense-(" + trimmed + ")";
                        LOG.warn("Can't find spdx expression '{}' in tmd. Writing '{}'.", trimmed, mapped);
                        addError(artifact, "No data for spdxId '" + trimmed + "'");
                    }
                    andJoiner.add(mapped);
                }
            }
        }

        orJoiner.add(andJoiner.toString());
    }

    return orJoiner.toString();
}
/**
 * Appends an error string to the "Errors" attribute of an artifact, separated by ", ".
 * Handy when the inventory itself is written out afterwards for error checking (rarely used).
 * Does nothing when no artifact is given.
 *
 * @param artifact    the artifact to append an error string to; may be null
 * @param errorString the error string to append to the error field
 */
protected static void addError(Artifact artifact, String errorString) {
    if (artifact == null) {
        return;
    }
    artifact.append("Errors", errorString, ", ");
}
/**
 * Provides the name that represents a license when a licenseRef is generated for it.
 * This is the tmd's short name; a warning is logged if that name is blank, but it is returned regardless.
 *
 * @param tmd the base data for name generation
 * @return the name string based on tmd
 */
public static String getMetaeffektRepresentativeName(TermsMetaData tmd) {
    // short names are used for our license ids since they read better
    final String shortName = tmd.getShortName();

    if (!StringUtils.isBlank(shortName)) {
        return shortName;
    }

    // sanity check failed: report, but still hand back what we have
    LOG.warn(
            "Could not create meta representation for tmd [{}]. Indicates a processing error or invalid tmd.",
            tmd.getCanonicalName()
    );
    return shortName;
}
/**
 * Resolves a proto expression that is treated as a single license name.
 * A surrounding pair of square brackets is stripped, the remaining name is resolved to tmd (directly or via
 * the canonical name history) and the expression for that tmd is derived.
 *
 * @param protoExpression      the name to resolve, optionally wrapped in square brackets
 * @param canonicalName        name of the tmd owning the expression; only used for logging
 * @param referencedLicenses   modifiable set that keeps track of referenced licenses
 * @param referencedWithoutTmd modifiable set that keeps track of references without a valid tmd; the name is
 *                             added here if it can't be resolved
 * @param recursionDepth       current recursion depth; passed on incremented by one
 * @return the derived expression, or null if no tmd could be found for the name
 */
protected String parseProtoExpressionBottomLevel(String protoExpression,
                                                 String canonicalName,
                                                 Set<TermsMetaData> referencedLicenses,
                                                 Set<String> referencedWithoutTmd,
                                                 int recursionDepth) {
    // strip surrounding brackets and resolve string literal.
    final String nameForPlaceholderBracket;
    if (protoExpression.startsWith("[") && protoExpression.endsWith("]")) {
        nameForPlaceholderBracket = protoExpression.substring(1, protoExpression.length() - 1);
    } else {
        nameForPlaceholderBracket = protoExpression;
    }

    // find tmd of this placeholder. we do a custom job here to better warn about outdated names
    TermsMetaData resolvedTmdForPlaceholder = normMeta.findTermsMetaData(nameForPlaceholderBracket);
    if (resolvedTmdForPlaceholder == null) {
        resolvedTmdForPlaceholder = normMeta.findUsingCanonicalNameInHistory(nameForPlaceholderBracket);
        if (resolvedTmdForPlaceholder != null) {
            // logging here since expressions should be squeaky clean but tmd doesn't test for this yet
            LOG.info(
                    "Old canonicalName [{}] still used in expression of tmd [{}].",
                    nameForPlaceholderBracket,
                    canonicalName
            );
        }
    }

    if (resolvedTmdForPlaceholder == null) {
        LOG.warn("No terms metadata found for [{}] referenced in spdxExpression of tmd [{}].",
                nameForPlaceholderBracket,
                canonicalName
        );
        // we have no choice but to fail if we can't create a correct expression from the proto
        referencedWithoutTmd.add(nameForPlaceholderBracket);
        // abort and let the algorithm use another type of identifier
        return null;
    }

    return deriveLicenseExpressionFromTmd(
            resolvedTmdForPlaceholder,
            referencedLicenses,
            referencedWithoutTmd,
            recursionDepth + 1
    );
}
/**
 * Tries to replace all placeholder names in the proto expression.
 * Square brackets are found recursively and replaced with tmd-compatible identifiers.
 * <p>
 * An expression with at most one top-level bracket pair is handed to
 * {@link #parseProtoExpressionBottomLevel} as a whole. Otherwise each top-level bracket is parsed on its own
 * and the text between the brackets is copied unchanged.
 *
 * @param protoExpression      the expression construct from tmd
 * @param canonicalName        name to produce better error logs if something goes wrong
 * @param referencedLicenses   method will add to this modifiable set of referenced licenses
 * @param referencedWithoutTmd method will add to this modifiable set of licenses that don't have tmd
 * @param recursionDepth       the amount of times this method has been called recursively to prevent infinite
 *                             loops. Should be 0 on first call, parsing is aborted once the value exceeds
 *                             {@link #maxRecursionDepth}
 * @return returns an spdx expression with identifiers inserted or null on (logic) error
 * @throws RuntimeException if {@code recursionDepth} exceeds {@link #maxRecursionDepth}
 */
public String parseProtoExpression(String protoExpression,
                                   String canonicalName,
                                   Set<TermsMetaData> referencedLicenses,
                                   Set<String> referencedWithoutTmd,
                                   int recursionDepth) {
    if (protoExpression == null) {
        return null;
    }

    if (recursionDepth > maxRecursionDepth) {
        // exception messages are not run through the logger, so the values have to be formatted in here
        throw new RuntimeException(String.format(
                "Recursion depth exceeded while parsing expression [%s] in [%s].",
                protoExpression,
                canonicalName
        ));
    }

    final NavigableMap<Integer, Integer> topBrackets =
            LicenseStringUtils.findTopBrackets(protoExpression, '[', ']');

    if (topBrackets.size() <= 1) {
        return parseProtoExpressionBottomLevel(
                protoExpression,
                canonicalName,
                referencedLicenses,
                referencedWithoutTmd,
                recursionDepth + 1
        );
    }

    // otherwise continue to resolve brackets until we reach the bottom level of top brackets.
    final StringBuilder outputExpression = new StringBuilder();
    int lastEnd = 0;

    for (Map.Entry<Integer, Integer> bracket : topBrackets.entrySet()) {
        final int start = bracket.getKey();
        final int end = bracket.getValue();

        // copy the text between the previous bracket and this one
        outputExpression.append(protoExpression, lastEnd, start);

        final String topBracketContent = protoExpression.substring(start + 1, end);

        // has to be determined before the parsed content is appended
        final boolean isAfterWith = endsWithWithPattern.matcher(outputExpression.toString()).find();

        final String parsedSubstring = parseProtoExpression(
                topBracketContent,
                canonicalName,
                referencedLicenses,
                referencedWithoutTmd,
                recursionDepth + 1
        );

        if (parsedSubstring == null) {
            return null;
        }

        // FIXME: find a better way to deal with the issue of spdx not allowing licenseRef after WITH
        if (isAfterWith && parsedSubstring.startsWith("LicenseRef-")) {
            // spdx doesn't allow a license ref after WITH. give up with null so that the caller uses the
            // combo identifier as a raw ref.
            return null;
        }

        outputExpression.append(parsedSubstring);

        // skip the bracket as well
        lastEnd = end + 1;
    }

    // copy whatever follows the last bracket
    outputExpression.append(protoExpression, lastEnd, protoExpression.length());

    return outputExpression.toString();
}
/**
 * Represents a tmd as an spdx-compatible license expression.
 * Entry point that starts the derivation at recursion depth 0.
 *
 * @param tmd                  the tmd to turn into a license expression
 * @param referencedLicenses   modifiable set that keeps track of referenced licenses
 * @param referencedWithoutTmd modifiable set that keeps track of references without a valid tmd
 * @return returns a license expression
 * @see #deriveSpdxRepresentation
 */
public String deriveLicenseExpressionFromTmd(TermsMetaData tmd,
                                             Set<TermsMetaData> referencedLicenses,
                                             Set<String> referencedWithoutTmd) {
    return deriveLicenseExpressionFromTmd(tmd, referencedLicenses, referencedWithoutTmd, 0);
}
/**
 * Builds a license ref for a tmd. The first of these namespaces that yields a non-blank ref wins:
 * scancode (via the tmd's "scancode" id), metaeffekt (via the representative name), spdx (via the spdx
 * identifier). If none does, an error is logged and the blank (possibly null) result is returned as is.
 *
 * @param tmd the tmd to create a license ref for
 * @return the license ref, blank or null if none could be created
 */
public String getLicenseRefForTmd(TermsMetaData tmd) {
    String candidateRef = null;

    // FIXME: scancode fallback is causing a loss of information such as with:
    //  - "scancode:bsd-original"
    //  loss of information is what we were trying to minimize here, so we should change our logic!
    //  intention is to fallback to public namespaces if possible

    // first choice: scancode namespace
    final String scancodeId = tmd.getOtherId("scancode");
    if (StringUtils.isNotBlank(scancodeId)) {
        candidateRef = LicenseStringUtils.createScancodeLicenseRef(scancodeId);
        if (StringUtils.isNotBlank(candidateRef)) {
            return candidateRef;
        }
    }

    // second choice: metaeffekt namespace
    final String aeName = getMetaeffektRepresentativeName(tmd);
    if (StringUtils.isNotBlank(aeName)) {
        candidateRef = LicenseStringUtils.createAeLicenseRef(aeName);
        if (StringUtils.isNotBlank(candidateRef)) {
            return candidateRef;
        }
    }

    // BODGE: last choice is a licenseref in the spdx namespace.
    // this happens if an id exists but it's used in the wrong context like
    // "... AND (exception id)" or "... WITH licenseRef", since spdx 2.3 doesn't support those cases at all.
    // the current library also throws nasty errors if you try it anyway (but only at serialization).
    final String spdxId = tmd.getSpdxIdentifier();
    if (StringUtils.isNotBlank(spdxId)) {
        candidateRef = LicenseStringUtils.createSpdxLicenseRef(spdxId);
        if (StringUtils.isNotBlank(candidateRef)) {
            return candidateRef;
        }
    }

    // sanity check: something is really broken if we get here, we shouldn't leave any cases like this
    LOG.error("Sanity check failed: will return empty license expression (should never happen).");

    // TODO: i was told that either shortName or spdxIdentifier need to exist?
    //  otherwise we maybe should generate -unknown- licenseRef here, also, which might prevent errors
    return candidateRef;
}
/**
 * Represents a tmd as an spdx-compatible license expression.
 * <p>
 * Sources are tried in this order: the tmd's spdx identifier (only if the SPDX library lists it as a license
 * or exception id), the tmd's spdx proto expression (parsed and wrapped in parentheses), and finally a
 * license ref, in which case the tmd is recorded in {@code referencedLicenses}.
 *
 * @param tmd                  the tmd to turn into a license expression; must not be null
 * @param referencedLicenses   modifiable set that keeps track of referenced licenses
 * @param referencedWithoutTmd modifiable set that keeps track of references without a valid tmd
 * @param recursionDepth       to prevent the horribly complex {@link #parseProtoExpression} methods from looping
 * @return returns a license expression
 * @throws NullPointerException if {@code tmd} is null
 * @see #deriveSpdxRepresentation
 */
public String deriveLicenseExpressionFromTmd(
        TermsMetaData tmd,
        Set<TermsMetaData> referencedLicenses,
        Set<String> referencedWithoutTmd,
        int recursionDepth
) {
    Objects.requireNonNull(tmd);

    // trace for debugging in case a tmd loops
    LOG.trace(
            "deriving spdx expression from tmd [{}] at recursion depth [{}].",
            tmd.getCanonicalName(),
            recursionDepth
    );

    // first try to use a curated spdx value from tmd
    String expressionOrId = tmd.getSpdxIdentifier();

    if (StringUtils.isNotBlank(expressionOrId)) {
        // only consult the spdx library if there is an id to check
        final ListedLicenses listedLicenses = ListedLicenses.getListedLicenses();
        final boolean knownToSpdx = listedLicenses.getSpdxListedLicenseIds().contains(expressionOrId)
                || listedLicenses.isSpdxListedExceptionId(expressionOrId);

        if (!knownToSpdx) {
            LOG.warn(
                    "The supposed spdxIdentifier [{}] does not exist according to the SPDX library. Using other id.",
                    expressionOrId
            );
            expressionOrId = null;
        }
    }

    // second choice: the proto expression from tmd
    if (StringUtils.isBlank(expressionOrId)) {
        final String protoExpression = tmd.getSpdxExpression();
        if (protoExpression != null) {
            expressionOrId = parseProtoExpression(
                    protoExpression,
                    tmd.getCanonicalName(),
                    referencedLicenses,
                    referencedWithoutTmd,
                    recursionDepth + 1
            );

            // returns null on error; a successfully parsed expression is bracketed to keep it self-contained
            if (StringUtils.isNotBlank(expressionOrId)) {
                expressionOrId = "(" + expressionOrId + ")";
            }
        }
    }

    // last resort: reference the license by a license ref and remember that we did
    if (StringUtils.isBlank(expressionOrId)) {
        expressionOrId = getLicenseRefForTmd(tmd);
        referencedLicenses.add(tmd);
    }

    return expressionOrId;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy