// Source listing: com.metaeffekt.artifact.analysis.version.token.VersionTokenizer (Maven / Gradle / Ivy)
// Listing note: this is the newest published version.
/*
* Copyright 2021-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.metaeffekt.artifact.analysis.version.token;
import com.metaeffekt.artifact.analysis.version.VersionModifier;
import org.metaeffekt.core.inventory.processor.model.Constants;

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
/**
 * Splits version strings into a list of {@link VersionToken}s.
 * <p>
 * Tokenization happens in two passes: a character scan that groups the input into number/semver,
 * string, separator and other tokens, followed by a resolution pass that recognizes version
 * modifiers, dates and a few single-letter abbreviations and drops separators.
 * <p>
 * All members are static; the class only serves as a namespace for them.
 */
public abstract class VersionTokenizer {

    private static final int TOKENIZER_STATE_NONE = 0;
    private static final int TOKENIZER_STATE_NUMBER = 1;
    private static final int TOKENIZER_STATE_LETTER = 2;

    /**
     * Placeholder used for "no previous/next character" at the edges of the input.
     * It is neither a digit nor {@code 'x'}, so it never counts as part of a number.
     */
    private static final char NO_CHAR = (char) -1;

    /**
     * Regular expression pattern for Git commit hashes.
     * <p>
     * This pattern matches Git commit hashes that:
     * <ul>
     *     <li>Are preceded by a dash, underscore, or space (the preceding character is part of the match)</li>
     *     <li>Contain 7 to 40 hexadecimal digits (a-f, A-F, 0-9)</li>
     *     <li>Contain at least one letter (a-f or A-F)</li>
     *     <li>Contain at least one digit (0-9)</li>
     *     <li>Are located at the very end of the string</li>
     * </ul>
     */
    private static final Pattern GIT_HASH_PATTERN = Pattern.compile("[-_ ](?=.*[a-fA-F])(?=.*[0-9])[a-fA-F0-9]{7,40}$");

    /** Characters that are classified as {@link VersionTokenType#SEPARATOR} during scanning. */
    private static final char[] VERSION_SEPARATORS = new char[]{'.', '-', '_', ' ', '/', '\\', ':', '+', '~', '(', ')', '[', ']', '{', '}'};

    /**
     * Combines a version and an update part into a single version string.
     * <p>
     * A part that equals {@link Constants#ASTERISK} or {@code "-"} is treated as absent.
     *
     * @param version the version part, may be {@code null}
     * @param update  the update part, may be {@code null}
     * @return {@code version + "_" + update} if both parts are present, the single present part
     * otherwise, or {@code null} if both are absent
     */
    public static String buildEffectiveVersionString(String version, String update) {
        final String effectiveVersion = isUnspecified(version) ? null : version;
        final String effectiveUpdate = isUnspecified(update) ? null : update;

        if (effectiveVersion == null) {
            return effectiveUpdate;
        }
        if (effectiveUpdate == null) {
            return effectiveVersion;
        }
        return effectiveVersion + "_" + effectiveUpdate;
    }

    /**
     * Tokenizes the string built by {@link #buildEffectiveVersionString(String, String)}.
     *
     * @param version the version part, may be {@code null}
     * @param update  the update part, may be {@code null}
     * @return the resolved tokens; empty if neither part is present
     */
    public static List<VersionToken> tokenize(String version, String update) {
        return tokenize(buildEffectiveVersionString(version, update));
    }

    /**
     * Tokenizes a version string.
     * <p>
     * Before scanning, a trailing Git commit hash (see {@link #GIT_HASH_PATTERN}) is removed and
     * every occurrence of {@code "release-candidate"} is replaced by {@code "rc"}.
     *
     * @param versionString the version string, may be {@code null}
     * @return a new, mutable list of resolved tokens; empty if {@code versionString} is {@code null}
     */
    public static List<VersionToken> tokenize(String versionString) {
        if (versionString == null) {
            return new ArrayList<>();
        }

        final String gitHashRemovedVersionString = containsGitHash(versionString)
                ? GIT_HASH_PATTERN.matcher(versionString).replaceAll("")
                : versionString;
        final String effectiveVersionString = gitHashRemovedVersionString.replace("release-candidate", "rc");

        final List<VersionToken> rawTokens = scanRawTokens(effectiveVersionString);
        final List<VersionToken> effectiveTokens = resolveSpecialTokens(rawTokens);
        joinChainedVersionModifiers(effectiveTokens);
        return effectiveTokens;
    }

    /** Returns whether the part is one of the placeholders that stand for "no value". */
    private static boolean isUnspecified(String part) {
        return Constants.ASTERISK.equals(part) || "-".equals(part);
    }

    /**
     * First pass: groups consecutive characters into NUMBER_OR_SEMVER, STRING, SEPARATOR and OTHER tokens.
     * <p>
     * Digits, dots that sit between two number characters, and an {@code 'x'} wildcard directly
     * after a dot inside a number are collected into one NUMBER_OR_SEMVER token. Consecutive letters
     * form one STRING token. Every other character becomes a token of its own.
     */
    private static List<VersionToken> scanRawTokens(String input) {
        final List<VersionToken> tokens = new ArrayList<>();
        final StringBuilder buffer = new StringBuilder();
        final int length = input.length();

        int state = TOKENIZER_STATE_NONE;
        VersionTokenType currentTokenType = VersionTokenType.OTHER;

        for (int i = 0; i < length; i++) {
            final char currentChar = input.charAt(i);
            final char previousChar = i > 0 ? input.charAt(i - 1) : NO_CHAR;
            final char nextChar = i + 1 < length ? input.charAt(i + 1) : NO_CHAR;

            final int bufferLength = buffer.length();
            final boolean bufferEndsWithDot = bufferLength > 0 && buffer.charAt(bufferLength - 1) == '.';

            // an 'x' only counts as a number character directly after a dot, e.g. "1.x"
            final boolean isNumberCharacter = tokenizerIsValidNumberCharacterForSemVer(bufferLength, state, bufferEndsWithDot, currentChar);
            // a dot only belongs to a number if it is enclosed by number characters
            final boolean isDotInsideNumber = currentChar == '.'
                    && tokenizerIsValidNumberCharacterForSemVer(bufferLength, state, true, previousChar)
                    && tokenizerIsValidNumberCharacterForSemVer(bufferLength, state, true, nextChar);

            if (isNumberCharacter || isDotInsideNumber) {
                if (state != TOKENIZER_STATE_NUMBER) {
                    appendBufferToTokens(buffer, currentTokenType, tokens);
                    state = TOKENIZER_STATE_NUMBER;
                }
                buffer.append(currentChar);
                currentTokenType = VersionTokenType.NUMBER_OR_SEMVER;

            } else if (Character.isLetter(currentChar)) {
                if (state != TOKENIZER_STATE_LETTER) {
                    appendBufferToTokens(buffer, currentTokenType, tokens);
                    state = TOKENIZER_STATE_LETTER;
                }
                buffer.append(currentChar);
                currentTokenType = VersionTokenType.STRING;

            } else {
                // separators and all remaining characters always start a new single-character token
                appendBufferToTokens(buffer, currentTokenType, tokens);
                buffer.append(currentChar);
                state = TOKENIZER_STATE_NONE;
                currentTokenType = tokenizerIsSeparator(currentChar) ? VersionTokenType.SEPARATOR : VersionTokenType.OTHER;
            }
        }

        appendBufferToTokens(buffer, currentTokenType, tokens);
        return tokens;
    }

    /**
     * Second pass: locates certain patterns in the raw tokens and replaces them with special tokens.
     * <ul>
     *     <li>STRING tokens "r", "u", "t", "p", "m" (case-insensitive) whose next non-separator token is a
     *     NUMBER_OR_SEMVER or DATE token are replaced by "rev", "update", "trial", "patch", "milestone"
     *     and then processed again</li>
     *     <li>STRING tokens that {@link VersionModifier#fromStringName(String)} recognizes become
     *     VERSION_MODIFIER tokens; if the next non-separator token is a NUMBER_OR_SEMVER token, that token
     *     is handed to the modifier token and removed from the input</li>
     *     <li>STRING tokens "v" directly in front of a NUMBER_OR_SEMVER token are dropped</li>
     *     <li>STRING tokens that follow a VERSION_MODIFIER token are added to it as sub token</li>
     *     <li>NUMBER_OR_SEMVER tokens whose first eight characters form a valid yyyyMMdd date become DATE tokens</li>
     *     <li>all SEPARATOR tokens are dropped</li>
     * </ul>
     *
     * @param tokens the raw tokens; this list is modified while processing
     * @return the resolved tokens
     */
    private static List<VersionToken> resolveSpecialTokens(List<VersionToken> tokens) {
        final List<VersionToken> effectiveTokens = new ArrayList<>();

        for (int i = 0; i < tokens.size(); i++) {
            final VersionToken token = tokens.get(i);

            if (token.getType() == VersionTokenType.STRING) {
                // Locale.ROOT keeps the keyword matching independent of the default locale
                final String lowercaseTokenValue = token.getValue().toLowerCase(Locale.ROOT);

                final int nextNonSeparatorIndex = findNextNonSeparatorTokenIndex(tokens, i);
                final VersionToken nextNonSeparatorToken = nextNonSeparatorIndex < 0 ? null : tokens.get(nextNonSeparatorIndex);

                final String expandedAbbreviation = expandSingleLetterAbbreviation(lowercaseTokenValue);
                if (expandedAbbreviation != null && nextNonSeparatorToken != null
                        && (nextNonSeparatorToken.getType() == VersionTokenType.NUMBER_OR_SEMVER || nextNonSeparatorToken.getType() == VersionTokenType.DATE)) {
                    tokens.set(i, new VersionToken(expandedAbbreviation, VersionTokenType.STRING));
                    // visit the same index again so that the expanded token runs through the checks below
                    i--;
                    continue;
                }

                final VersionModifier versionModifier = VersionModifier.fromStringName(lowercaseTokenValue);
                if (versionModifier != null) {
                    if (nextNonSeparatorToken != null && nextNonSeparatorToken.getType() == VersionTokenType.NUMBER_OR_SEMVER) {
                        effectiveTokens.add(new VersionToken(versionModifier.getNames()[0], VersionTokenType.VERSION_MODIFIER, nextNonSeparatorToken));
                        // remove by index: removing by object would rely on equals() and could hit
                        // an earlier token that merely has the same value
                        tokens.remove(nextNonSeparatorIndex);
                    } else {
                        effectiveTokens.add(new VersionToken(lowercaseTokenValue, VersionTokenType.VERSION_MODIFIER));
                    }
                    continue;
                }

                // only a directly adjacent number counts here, separators in between are not skipped
                final VersionToken nextToken = i + 1 < tokens.size() ? tokens.get(i + 1) : null;
                if ("v".equals(lowercaseTokenValue) && nextToken != null && nextToken.getType() == VersionTokenType.NUMBER_OR_SEMVER) {
                    continue;
                }

                final VersionToken latestEffectiveToken = effectiveTokens.isEmpty() ? null : effectiveTokens.get(effectiveTokens.size() - 1);
                if (latestEffectiveToken != null && latestEffectiveToken.getType() == VersionTokenType.VERSION_MODIFIER) {
                    latestEffectiveToken.addSubToken(token);
                    continue;
                }
            }

            if (token.getType() == VersionTokenType.NUMBER_OR_SEMVER && startsWithValidDate(token.getValue())) {
                effectiveTokens.add(new VersionToken(token.getValue(), VersionTokenType.DATE));
                continue;
            }

            if (token.getType() != VersionTokenType.SEPARATOR) {
                effectiveTokens.add(token);
            }
        }

        return effectiveTokens;
    }

    /**
     * Joins chained VERSION_MODIFIER tokens in place: a modifier that directly follows another
     * modifier is added to its predecessor as sub token and removed from the list.
     */
    private static void joinChainedVersionModifiers(List<VersionToken> effectiveTokens) {
        // walk backwards so that removing an element does not shift the indices still to be visited
        for (int i = effectiveTokens.size() - 1; i > 0; i--) {
            final VersionToken token = effectiveTokens.get(i);
            final VersionToken previousToken = effectiveTokens.get(i - 1);
            if (token.getType() == VersionTokenType.VERSION_MODIFIER && previousToken.getType() == VersionTokenType.VERSION_MODIFIER) {
                previousToken.addSubToken(token);
                effectiveTokens.remove(i);
            }
        }
    }

    /**
     * Maps a lowercase single-letter abbreviation to its long form.
     *
     * @return the long form, or {@code null} if the value is not one of "r", "t", "p", "m", "u"
     */
    private static String expandSingleLetterAbbreviation(String lowercaseTokenValue) {
        switch (lowercaseTokenValue) {
            case "r":
                return "rev";
            case "t":
                return "trial";
            case "p":
                return "patch";
            case "m":
                return "milestone";
            case "u":
                return "update";
            default:
                return null;
        }
    }

    /**
     * Checks whether the value has at least eight characters and whether its first eight characters
     * are a valid date in the {@link DateTimeFormatter#BASIC_ISO_DATE} (yyyyMMdd) format.
     * Characters after the date part are not inspected.
     */
    private static boolean startsWithValidDate(String tokenValue) {
        if (tokenValue.length() < 8) {
            return false;
        }
        try {
            LocalDate.parse(tokenValue.substring(0, 8), DateTimeFormatter.BASIC_ISO_DATE);
            return true;
        } catch (DateTimeParseException ignored) {
            // not a date, the token stays a regular number
            return false;
        }
    }

    /** Returns whether {@link #GIT_HASH_PATTERN} matches anywhere in the version string. */
    private static boolean containsGitHash(String version) {
        return GIT_HASH_PATTERN.matcher(version).find();
    }

    /**
     * Finds the position of the first token after {@code startIndex} that is not a SEPARATOR.
     *
     * @return the index of that token, or {@code -1} if only separators (or nothing) follow
     */
    private static int findNextNonSeparatorTokenIndex(List<VersionToken> tokens, int startIndex) {
        for (int i = startIndex + 1; i < tokens.size(); i++) {
            if (tokens.get(i).getType() != VersionTokenType.SEPARATOR) {
                return i;
            }
        }
        return -1;
    }

    /** Returns whether the character is one of the {@link #VERSION_SEPARATORS}. */
    private static boolean tokenizerIsSeparator(char currentChar) {
        for (char separator : VERSION_SEPARATORS) {
            if (currentChar == separator) {
                return true;
            }
        }
        return false;
    }

    /**
     * Decides whether a character may be part of a number/semver token.
     * <p>
     * Digits always qualify. The wildcard {@code 'x'} qualifies only while a number is being
     * collected (non-empty buffer in number state) and the caller permits it via {@code allowXtoMatch}.
     */
    private static boolean tokenizerIsValidNumberCharacterForSemVer(int bufferLength, int state, boolean allowXtoMatch, char character) {
        if (Character.isDigit(character)) {
            return true;
        }
        return bufferLength > 0 && state == TOKENIZER_STATE_NUMBER && allowXtoMatch && character == 'x';
    }

    /**
     * Emits the trimmed buffer content as a token of the given type, unless it is empty, and resets the buffer.
     * <p>
     * The buffer is reset even if nothing is emitted, so whitespace-only content does not leak
     * into the next token. Since the content is trimmed before use, this does not alter the emitted tokens.
     */
    private static void appendBufferToTokens(StringBuilder buffer, VersionTokenType type, List<VersionToken> tokens) {
        final String trimmedBuffer = buffer.toString().trim();
        if (!trimmedBuffer.isEmpty()) {
            tokens.add(new VersionToken(trimmedBuffer, type));
        }
        buffer.setLength(0);
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy