
toolkit.utils.ort-utils.42.1.0.source-code.CopyrightStatementsProcessor.kt Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of ort-utils Show documentation
Show all versions of ort-utils Show documentation
Part of the OSS Review Toolkit (ORT), a suite to automate software compliance checks.
/*
* Copyright (C) 2017 The ORT Project Authors (see )
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SPDX-License-Identifier: Apache-2.0
* License-Filename: LICENSE
*/
package org.ossreviewtoolkit.utils.ort
import com.fasterxml.jackson.annotation.JsonIgnore
import com.fasterxml.jackson.annotation.JsonPropertyOrder
import com.fasterxml.jackson.databind.annotation.JsonSerialize
import org.ossreviewtoolkit.utils.common.StringSortedSetConverter
import org.ossreviewtoolkit.utils.common.collapseToRanges
import org.ossreviewtoolkit.utils.common.collapseWhitespace
import org.ossreviewtoolkit.utils.common.prettyPrintRanges
private val INVALID_OWNER_START_CHARS = charArrayOf(' ', ';', '.', ',', '-', '+', '~', '&')
private val INVALID_OWNER_KEY_CHARS = charArrayOf('<', '>', '(', ')', '[', ']') + INVALID_OWNER_START_CHARS
private const val YEAR_PLACEHOLDER = ""
private val COMMA_SEPARATED_YEARS_REGEX = "(?=.*)\\b(\\d{4})\\b( *, *)\\b(\\d{4})\\b".toRegex()
private val KNOWN_PREFIX_REGEX = listOf(
"^(?:\\([C|c]\\))",
"^(?:\\([C|c]\\) [C|c]opyright)",
"^(?:\\([C|c]\\) [C|c]opyrighted)",
"^(?:[C|c]opyright)",
"^(?:[C|c]opyright \\([C|c]\\))",
"^(?:[C|c]opyright [O|o]wnership)",
"^(?:[C|c]opyright')",
"^(?:[C|c]opyright' \\([C|c]\\))",
"^(?:COPYRIGHT)",
"^(?:[C|c]opyrighted)",
"^(?:[C|c]opyrighted \\([C|c]\\))",
"^(?:[P|p]ortions [C|c]opyright)",
"^(?:[P|p]ortions \\([C|c]\\))",
"^(?:[P|p]ortions [C|c]opyright \\([C|c]\\))"
).map { it.toRegex() }
private val SINGLE_YEARS_REGEX = "(?=.*)\\b(\\d{4})\\b".toRegex()
private val U_QUOTE_REGEX = "(.*\\b)u'(\\d{4}\\b)".toRegex()
private val YEAR_RANGE_REGEX = "(?=.*)\\b(\\d{4})( *- *)(\\d{4}|\\d{2}|\\d)\\b".toRegex()
/**
* Remove all found years from the [copyrightStatement] and replace them with the [YEAR_PLACEHOLDER]. The replacement is
* not necessary for implementing the needed functionality, but it is helpful for debugging.
*/
private fun replaceYears(copyrightStatement: String): Pair> {
/**
* Replace the first year range in the [copyrightStatement] with the [YEAR_PLACEHOLDER] and return the resulting
* string paired to the set of years.
*/
fun replaceYearRange(copyrightStatement: String): Pair> {
@Suppress("UnsafeCallOnNullableType")
YEAR_RANGE_REGEX.findAll(copyrightStatement).forEach { matchResult ->
val fromGroup = matchResult.groups[1]!!
val separatorGroup = matchResult.groups[2]!!
val toGroup = matchResult.groups[3]!!
val fromYearString = fromGroup.value
val fromYear = fromGroup.value.toInt()
// Handle also the following cases: '2008 - 9' and '2001 - 10'.
val toYear = toGroup.value.let { fromYearRaw ->
"${fromYearString.substring(0, fromYearString.length - fromYearRaw.length)}$fromYearRaw".toInt()
}
if (fromYear <= toYear) {
return Pair(
copyrightStatement
.removeRange(toGroup.range)
.removeRange(separatorGroup.range)
.replaceRange(fromGroup.range, YEAR_PLACEHOLDER),
(fromYear..toYear).toSet()
)
}
}
return Pair(copyrightStatement, emptySet())
}
/**
* Replace all year ranges in the [copyrightStatement] with the [YEAR_PLACEHOLDER] and return the resulting string
* paired to the set of years.
*/
fun replaceAllYearRanges(copyrightStatement: String): Pair> {
val years = mutableSetOf()
var currentStatement = copyrightStatement
while (true) {
val replaceResult = replaceYearRange(currentStatement)
if (replaceResult.second.isEmpty()) {
return Pair(currentStatement, years)
}
years += replaceResult.second
currentStatement = replaceResult.first
}
}
val resultYears = mutableSetOf()
// Fix up strings containing e.g.: 'copyright u'2013'
var currentStatement = copyrightStatement.replace(U_QUOTE_REGEX, "$1$2")
val replaceRangeResult = replaceAllYearRanges(currentStatement)
currentStatement = replaceRangeResult.first
resultYears += replaceRangeResult.second
// Replace comma separated years.
var matchResult = COMMA_SEPARATED_YEARS_REGEX.find(currentStatement)
@Suppress("UnsafeCallOnNullableType")
while (matchResult != null) {
currentStatement = currentStatement.removeRange(matchResult.groups[2]!!.range)
currentStatement = currentStatement.replaceRange(matchResult.groups[1]!!.range, "$YEAR_PLACEHOLDER ")
resultYears += matchResult.groups[1]!!.value.toInt()
matchResult = COMMA_SEPARATED_YEARS_REGEX.find(currentStatement)
}
// Replace single years.
matchResult = SINGLE_YEARS_REGEX.find(currentStatement)
@Suppress("UnsafeCallOnNullableType")
while (matchResult != null) {
currentStatement = currentStatement.replaceRange(matchResult.groups[1]!!.range, YEAR_PLACEHOLDER)
resultYears += matchResult.groups[1]!!.value.toInt()
matchResult = SINGLE_YEARS_REGEX.find(currentStatement)
}
currentStatement = currentStatement.replace("$YEAR_PLACEHOLDER $YEAR_PLACEHOLDER", YEAR_PLACEHOLDER)
return Pair(currentStatement, resultYears)
}
/**
* A copyright statement consists in most cases of three parts: a copyright prefix, years and the owner. For legal
* reasons the prefix part must not be modified at all while adjusting some special characters in the owner part is
* acceptable. Entries can be merged by year as well. The main idea of the algorithm is to process only entries with
* a known copyright prefix. This allows stripping the prefix and processing the remaining string separately and thus
* guarantees that the prefix part is not modified at all.
*
* TODO: Maybe treat URLs similar to years, e.g. entries which differ only in URLs and years can be merged.
*/
object CopyrightStatementsProcessor {
data class Parts(
val prefix: String,
val years: Set,
val owner: String,
val originalStatements: List
) : Comparable {
companion object {
private val COMPARATOR =
compareBy({ it.owner }, { it.years.collapseToRanges().prettyPrintRanges() }, { it.prefix })
}
override fun compareTo(other: Parts) = COMPARATOR.compare(this, other)
override fun toString() =
buildString {
append(prefix)
if (years.isNotEmpty()) {
append(" ")
append(years.collapseToRanges().prettyPrintRanges())
}
if (owner.isNotEmpty()) {
append(" ")
append(owner)
}
}
}
data class Result(
/**
* The copyright statements that were processed by the [CopyrightStatementsProcessor], mapped to the original
* copyright statements. An original statement can be identical to the processed statement if the processor did
* process but not modify it.
*/
@JsonPropertyOrder(alphabetic = true)
@JsonSerialize(contentConverter = StringSortedSetConverter::class)
val processedStatements: Map>,
/**
* The copyright statements that were ignored by the [CopyrightStatementsProcessor].
*/
@JsonSerialize(converter = StringSortedSetConverter::class)
val unprocessedStatements: Set
) {
@get:JsonIgnore
val allStatements by lazy { unprocessedStatements + processedStatements.keys }
}
/**
* Split the [copyrightStatement] into its [Parts], or return null if the [Parts] could not be determined.
*/
fun determineParts(copyrightStatement: String): Parts? {
/**
* Strip the longest [known copyright prefix][KNOWN_PREFIX_REGEX] from [copyrightStatement] and return a pair of
* the copyright statement without the prefix and the prefix that was stripped from it.
*/
fun stripKnownCopyrightPrefix(copyrightStatement: String): Pair {
val copyrightStatementWithoutPrefix = KNOWN_PREFIX_REGEX.map { regex ->
copyrightStatement.replace(regex, "")
}.minByOrNull {
it.length
} ?: return Pair(first = copyrightStatement, second = "")
return Pair(
first = copyrightStatementWithoutPrefix,
second = copyrightStatement.removeSuffix(copyrightStatementWithoutPrefix)
)
}
/**
* Remove all years from the [copyrightStatement] and return the stripped string paired to the set of years.
*/
fun stripYears(copyrightStatement: String): Pair> =
replaceYears(copyrightStatement).let {
it.copy(first = it.first.replace(YEAR_PLACEHOLDER, ""))
}
val prefixStripResult = stripKnownCopyrightPrefix(copyrightStatement)
if (prefixStripResult.second.isEmpty()) return null
val yearsStripResult = stripYears(prefixStripResult.first)
return Parts(
prefix = prefixStripResult.second,
years = yearsStripResult.second,
owner = yearsStripResult.first
.trimStart(*INVALID_OWNER_START_CHARS)
.collapseWhitespace(),
originalStatements = listOf(copyrightStatement)
)
}
/**
* Try to process the [copyrightStatements] into a more condensed form grouped by owner / prefix and with years
* collapsed. The returned [Result] contains successfully processed as well as unprocessed statements.
*/
fun process(copyrightStatements: Collection): Result {
/**
* Return a normalized Copyright owner to group statement parts by.
*/
fun String.toNormalizedOwnerKey() = filter { it !in INVALID_OWNER_KEY_CHARS }.uppercase()
/**
* Group this collection of [Parts] by prefix and owner and return a list of [Parts] with years and original
* statements merged accordingly.
*/
fun Collection.groupByPrefixAndOwner(): List {
val map = mutableMapOf()
forEach { part ->
val key = "${part.prefix}:${part.owner.toNormalizedOwnerKey()}"
map.merge(key, part) { existing, other ->
Parts(
prefix = existing.prefix,
years = existing.years + other.years,
owner = existing.owner,
originalStatements = existing.originalStatements + other.originalStatements
)
}
}
return map.values.toList()
}
val unprocessedStatements = mutableSetOf()
val processableStatements = mutableListOf()
copyrightStatements.distinct().forEach { statement ->
val parts = determineParts(statement)
if (parts != null) {
processableStatements += parts
} else {
unprocessedStatements += statement
}
}
val mergedParts = processableStatements.sorted().groupByPrefixAndOwner()
val processedStatements = mutableMapOf>()
mergedParts.forEach {
if (it.owner.isNotEmpty()) {
val statement = it.toString()
processedStatements[statement] = it.originalStatements.toSet()
}
}
return Result(
processedStatements = processedStatements,
unprocessedStatements = unprocessedStatements
)
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy