// eu.cqse.check.framework.scanner.ambiguous_language.AmbiguousLanguageResolutionUtils (Maven / Gradle / Ivy)
package eu.cqse.check.framework.scanner.ambiguous_language;
import static eu.cqse.check.framework.scanner.ELanguage.C;
import static eu.cqse.check.framework.scanner.ELanguage.CPP;
import static eu.cqse.check.framework.scanner.ELanguage.KUBERNETES;
import static eu.cqse.check.framework.scanner.ELanguage.LINE;
import static eu.cqse.check.framework.scanner.ELanguage.MATLAB;
import static eu.cqse.check.framework.scanner.ELanguage.OBJECTIVE_C;
import static eu.cqse.check.framework.scanner.ELanguage.OBJECTIVE_CPP;
import static eu.cqse.check.framework.scanner.ELanguage.VB;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.logging.log4j.LogManager;
import org.checkerframework.checker.nullness.qual.NonNull;
import org.conqat.engine.resource.util.UniformPathUtils;
import org.conqat.lib.commons.assertion.CCSMAssert;
import org.conqat.lib.commons.collections.CollectionUtils;
import org.conqat.lib.commons.filesystem.FileExtensions;
import org.conqat.lib.commons.string.StringUtils;
import com.google.common.annotations.VisibleForTesting;
import eu.cqse.check.framework.scanner.ELanguage;
/**
 * Util methods for deciding which language we should use (unless there is an explicit language
 * mapping configured) to process a file with an ambiguous file ending. For example '.h' could be a
 * C/C++ or Objective-C file.
 */
public class AmbiguousLanguageResolutionUtils {

    /**
     * Priority list of languages that share file extensions. If we see a file with a shared extension
     * and have both languages in the analysis profile, we need to look at the text to decide for a
     * language. If this text analysis is still ambiguous, we just choose the first language from this
     * priority list.
     * <p>
     * If we are wrong, the user/admin has to manually fix this via the language mapping option.
     * <p>
     * List semantics: the first language is the one that will be chosen with the most priority, the
     * last one with the least priority.
     */
    private static final List<ELanguage> AMBIGUOUS_LANGUAGES_PRIORITY_LIST = Arrays.asList(C, CPP, MATLAB,
            OBJECTIVE_C, OBJECTIVE_CPP, LINE, KUBERNETES);

    /** Identifiers used in typical Visual Basic files. */
    private static final Pattern VISUAL_BASIC_CONTENT_PATTERN = Pattern
            .compile("\\W(End Module|End Sub|imports|Dim|BEGIN)\\W");

    /**
     * Extensions of files we know to be text files. These should be all lower-case, because they are
     * compared against the lower-cased element name.
     */
    private static final String[] KNOWN_TEXT_FILE_EXTENSIONS = { FileExtensions.DOT_CSV, FileExtensions.DOT_TXT,
            FileExtensions.DOT_HTML, FileExtensions.DOT_ECUEXTRACT, FileExtensions.DOT_ECUEXTRACT_VARIANT,
            FileExtensions.DOT_ARXML, FileExtensions.DOT_ECUCONFIG };

    /** Matches a single ASCII character; used to count ASCII characters in a content prefix. */
    private static final Pattern ASCII_CHARACTER_PATTERN = Pattern.compile("\\p{ASCII}");

    /** Number of leading characters that are inspected by the binary content heuristic. */
    private static final int BINARY_CHECK_PREFIX_LENGTH = 200;

    /** Content whose inspected prefix has a lower share of ASCII characters is considered binary. */
    private static final double MIN_ASCII_RATIO_FOR_TEXT = .75;

    /**
     * Pattern that matches on #include and #import directives in C code (import is also used in the MS
     * variant of C).
     * <p>
     * The directives must be at the beginning of a line (optionally with leading spaces).
     */
    private static final Pattern OBJECTIVE_C_INCLUDE_MATCHER = Pattern.compile("(^|\\n)\\s*#(include|import)\\s");

    /**
     * Returns the {@link ELanguage} based on the configured languages and the element name.
     * <ul>
     * <li>If the element name has no file extension, we assume the file is a {@link ELanguage#CPP}
     * header, unless the content is likely binary.</li>
     * <li>If the element name ends with one of the {@link #KNOWN_TEXT_FILE_EXTENSIONS}, the language is
     * {@link ELanguage#LINE}.</li>
     * <li>Files with the {@link FileExtensions#DOT_CLS} extension are {@link ELanguage#VB} if the
     * content contains typical Visual Basic identifiers and {@link ELanguage#LINE} otherwise.</li>
     * <li>Otherwise, the language is determined based on the file extension and the configured
     * languages. {@link ELanguage#LINE} is used as a fallback for unknown extensions, unless the
     * content is likely binary.</li>
     * </ul>
     *
     * @param configuredLanguages
     *            the languages selected for the project; used to narrow down ambiguous extensions
     * @param uniformPath
     *            the uniform path of the file
     * @param fileContent
     *            the textual content of the file
     * @return the language to use, or <code>null</code> if the file has no or an unknown extension and
     *         its content is likely binary (in which case an error is logged)
     */
    public static ELanguage getLanguageFromUniformPath(Set<ELanguage> configuredLanguages, String uniformPath,
            String fileContent) {
        // Locale.ROOT keeps the lower-casing independent of the JVM default locale. With e.g. a Turkish
        // default locale "I" would become a dotless "ı" and extensions such as ".INC" would not match.
        String elementName = UniformPathUtils.getElementName(uniformPath).toLowerCase(Locale.ROOT);

        if (!elementName.contains(".")) {
            // no extension, so we guess CPP header
            return fallbackUnlessLikelyBinary(uniformPath, fileContent, CPP);
        }
        if (StringUtils.endsWithOneOf(elementName, KNOWN_TEXT_FILE_EXTENSIONS)) {
            return LINE;
        }

        String extension = UniformPathUtils.getExtension(elementName);
        CCSMAssert.isNotNull(extension);
        if (extension.equals(FileExtensions.DOT_CLS)) {
            if (VISUAL_BASIC_CONTENT_PATTERN.matcher(fileContent).find()) {
                return VB;
            }
            return LINE;
        }

        Set<ELanguage> languagesForFileExtension = ELanguage.getAllLanguagesForExtension(extension);
        if (languagesForFileExtension.isEmpty()) {
            // unknown extension, so we treat the file as plain text
            return fallbackUnlessLikelyBinary(uniformPath, fileContent, LINE);
        }
        if (languagesForFileExtension.size() == 1) {
            return CollectionUtils.getAny(languagesForFileExtension);
        }
        return decideForLanguage(configuredLanguages, fileContent, languagesForFileExtension);
    }

    /**
     * Decides between multiple languages that are registered for the same file extension, restricted to
     * the languages that are configured for the project.
     */
    private static ELanguage decideForLanguage(Set<ELanguage> configuredLanguages, String fileContent,
            Set<ELanguage> languagesForFileExtension) {
        Set<ELanguage> languageCandidates = CollectionUtils.intersectionSet(languagesForFileExtension,
                configuredLanguages);
        if (languageCandidates.isEmpty()) {
            // multiple candidates for file extension but no appropriate language configured
            // in analysis profile -> choose "first" matching language (always the same
            // language)
            return languagesForFileExtension.stream().sorted().findFirst()
                    .orElseThrow(() -> new AssertionError("Expected languagesForFileExtension to not be empty"));
        }
        return decideForAmbiguousLanguage(languageCandidates, fileContent);
    }

    /**
     * Returns the given fallback language, unless the content is likely binary. In the latter case an
     * error is logged and <code>null</code> is returned.
     */
    private static ELanguage fallbackUnlessLikelyBinary(String uniformPath, String fileContent,
            ELanguage fallback) {
        if (isLikelyBinaryContent(fileContent)) {
            logErrorAboutBinaryContent(uniformPath);
            return null;
        }
        return fallback;
    }

    /** Logs that the file with the given uniform path is ignored because it is likely binary. */
    private static void logErrorAboutBinaryContent(String uniformPath) {
        LogManager.getLogger(AmbiguousLanguageResolutionUtils.class)
                .error("Ignoring file with unsupported extension and likely binary content: " + uniformPath
                        + ". If this file is a valid code file that should be analyzed, please use an explicit language mapping in the connector configuration.");
    }

    /**
     * Heuristic that inspects the first {@value #BINARY_CHECK_PREFIX_LENGTH} characters of the content
     * and reports binary content if less than 75% of them are ASCII characters. Shorter content is
     * never reported as binary.
     */
    private static boolean isLikelyBinaryContent(String fileContent) {
        String contentStart = StringUtils.getFirstCharacters(fileContent, BINARY_CHECK_PREFIX_LENGTH);
        if (contentStart.length() < BINARY_CHECK_PREFIX_LENGTH) {
            // too short for reliable decision
            return false;
        }
        long numAsciiCharacters = 0;
        Matcher matcher = ASCII_CHARACTER_PATTERN.matcher(contentStart);
        while (matcher.find()) {
            numAsciiCharacters++;
        }
        return (double) numAsciiCharacters / contentStart.length() < MIN_ASCII_RATIO_FOR_TEXT;
    }

    /**
     * Decides the language for cases in which a single file extension is used in multiple languages.
     * <p>
     * The following file extensions have overlapping languages:
     * <ul>
     * <li>.h: Objective-C++, Objective-C, C++, C</li>
     * <li>.inc: Objective-C++, C++, C</li>
     * <li>.inl: Objective-C++, C++</li>
     * <li>.m: Objective-C, Matlab</li>
     * <li>.yaml, .yml, .json: Kubernetes, Line</li>
     * </ul>
     * For each file extension, we assume the right most language in the listing above as the default if
     * the language was selected for the project. We then check whether one of the languages listed
     * before fits better taking into consideration the selected languages for the project.
     * <p>
     * The given set is not modified.
     *
     * @param languageCandidates
     *            A set of languages which includes only those with an overlap for the current file's
     *            extension taking into consideration the selected languages for the project. Must not
     *            be empty.
     * @param fileContent
     *            the textual content of the file, which is searched for language-specific patterns
     * @return the language that fits best
     */
    @VisibleForTesting
    public static ELanguage decideForAmbiguousLanguage(Set<ELanguage> languageCandidates, String fileContent) {
        CCSMAssert.isNotEmpty(languageCandidates, "languageCandidates was empty");
        // Work on a copy: MATLAB may be removed below and the caller's set must stay untouched (it may
        // even be immutable). The EnumSet also gives the last-resort choice a stable order.
        Set<ELanguage> candidates = EnumSet.copyOf(languageCandidates);

        // Only the Objective-C/Objective-C++ languages clash with Matlab (on .m files).
        boolean clashesWithMatlab = candidates.contains(MATLAB)
                && (candidates.contains(OBJECTIVE_C) || candidates.contains(OBJECTIVE_CPP));
        if (clashesWithMatlab && OBJECTIVE_C_INCLUDE_MATCHER.matcher(fileContent).find()) {
            // If the code contains something that looks like a C include/import directive,
            // then it is very unlikely that it is matlab code (but we still have to decide
            // between the C derivatives).
            candidates.remove(MATLAB);
        }

        EnumSet<ELanguage> foundLanguagePatterns = scanCodeForLanguageSpecificPatterns(fileContent,
                determinePatternsToScanFor(candidates));
        if (foundLanguagePatterns.contains(OBJECTIVE_C) && foundLanguagePatterns.contains(CPP)) {
            // evidence for both Objective-C and C++ counts as evidence for Objective-C++
            foundLanguagePatterns.add(OBJECTIVE_CPP);
        }

        Set<ELanguage> foundLanguageCandidates = CollectionUtils.intersectionSet(candidates,
                foundLanguagePatterns);
        if (foundLanguageCandidates.contains(OBJECTIVE_CPP)) {
            return OBJECTIVE_CPP;
        }
        if (foundLanguageCandidates.contains(OBJECTIVE_C)) {
            return OBJECTIVE_C;
        }
        Optional<ELanguage> languageWithEvidence = firstByPriority(foundLanguageCandidates);
        if (languageWithEvidence.isPresent()) {
            return languageWithEvidence.get();
        }

        // We did not find evidence for any of the languages that could aid with the
        // decision. Now just have to decide based on a hard-coded priority list and, if
        // none of the candidates is part of that list, choose the first candidate.
        return firstByPriority(candidates).orElseGet(() -> candidates.iterator().next());
    }

    /**
     * Returns the first language of the {@link #AMBIGUOUS_LANGUAGES_PRIORITY_LIST} that is contained in
     * the given set, or an empty optional if the set contains none of them.
     */
    private static Optional<ELanguage> firstByPriority(Set<ELanguage> languages) {
        return AMBIGUOUS_LANGUAGES_PRIORITY_LIST.stream().filter(languages::contains).findFirst();
    }

    /**
     * Determines for which language patterns we need to search. For example, if
     * {@link ELanguage#OBJECTIVE_CPP} is a candidate, we need to search for C++ and Objective-C
     * patterns.
     */
    @NonNull
    private static EnumSet<ELanguage> determinePatternsToScanFor(Set<ELanguage> languageCandidates) {
        EnumSet<ELanguage> patternsToScanFor = EnumSet.noneOf(ELanguage.class);
        if (languageCandidates.contains(CPP) || languageCandidates.contains(OBJECTIVE_CPP)) {
            patternsToScanFor.add(CPP);
        }
        if (languageCandidates.contains(OBJECTIVE_C) || languageCandidates.contains(OBJECTIVE_CPP)) {
            patternsToScanFor.add(OBJECTIVE_C);
        }
        if (languageCandidates.contains(MATLAB)) {
            patternsToScanFor.add(MATLAB);
        }
        if (languageCandidates.contains(KUBERNETES)) {
            patternsToScanFor.add(KUBERNETES);
        }
        return patternsToScanFor;
    }

    /**
     * Returns the subset of languages for which the given file content shows evidence. Only the
     * languages in <code>patternsToSearchFor</code> are checked: {@link ELanguage#MATLAB} via the
     * {@link MatlabLanguageDetector}, {@link ELanguage#CPP} and {@link ELanguage#OBJECTIVE_C} via the
     * {@link CLikeLanguageDetector} and {@link ELanguage#KUBERNETES} via the
     * {@link KubernetesLanguageDetector}.
     *
     * @return a new, modifiable set, which is empty if no evidence was found
     */
    private static EnumSet<ELanguage> scanCodeForLanguageSpecificPatterns(String fileContent,
            EnumSet<ELanguage> patternsToSearchFor) {
        EnumSet<ELanguage> foundLanguagePatterns = EnumSet.noneOf(ELanguage.class);
        if (patternsToSearchFor.isEmpty()) {
            return foundLanguagePatterns;
        }
        if (patternsToSearchFor.contains(MATLAB) && MatlabLanguageDetector.containsLikelyMatlabCode(fileContent)) {
            foundLanguagePatterns.add(MATLAB);
        }
        if (patternsToSearchFor.contains(CPP) || patternsToSearchFor.contains(OBJECTIVE_C)) {
            foundLanguagePatterns
                    .addAll(CLikeLanguageDetector.scanForClikeLanguagePatterns(fileContent, patternsToSearchFor));
        }
        if (patternsToSearchFor.contains(KUBERNETES)
                && KubernetesLanguageDetector.isLikelyKubernetesContent(fileContent)) {
            foundLanguagePatterns.add(KUBERNETES);
        }
        return foundLanguagePatterns;
    }
}