
// eu.cqse.check.framework.scanner.AmbiguousLanguageResolutionUtils (Maven / Gradle / Ivy)
package eu.cqse.check.framework.scanner;
import java.util.EnumSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.logging.log4j.LogManager;
import org.conqat.engine.resource.util.UniformPathUtils;
import org.conqat.lib.commons.assertion.CCSMAssert;
import org.conqat.lib.commons.collections.CollectionUtils;
import org.conqat.lib.commons.filesystem.FileExtensions;
import org.conqat.lib.commons.string.StringUtils;
/**
 * Utility methods for deciding which language should be used to process a file
 * with an ambiguous file ending (unless an explicit language mapping is
 * configured). For example, '.h' could be a C/C++ or an Objective-C file.
 */
public class AmbiguousLanguageResolutionUtils {

    /**
     * C/C++ and Objective-C have an overlap on the file endings ".c", ".h", and
     * ".cpp".
     */
    private static final EnumSet<ELanguage> OBJECTIVE_C_AND_CPP = EnumSet.of(ELanguage.OBJECTIVE_C,
            ELanguage.CPP);

    /**
     * Objective-C and Matlab have an overlap on the file ending ".m".
     */
    private static final EnumSet<ELanguage> OBJECTIVE_C_AND_MATLAB = EnumSet.of(ELanguage.OBJECTIVE_C,
            ELanguage.MATLAB);

    /**
     * Keywords in code that indicate Objective-C code uniquely when both
     * Objective-C and another language with overlapping file endings like C/C++ or
     * Matlab are possible. Note that there is also an {@code #import} statement in
     * C++ to include type libraries. However, for projects with both C/C++ and
     * Objective-C it is extremely likely that this is still an Objective-C file
     * since imports of type libraries are rare in practice, especially for
     * projects using Objective-C. Therefore, we include {@code #import} as a
     * common Objective-C keyword.
     */
    private static final String[] OBJECTIVE_C_COMMON_KEYWORDS = { "@end", "@interface", "@property", "@protocol",
            "@import", "#import", "@class" };

    /**
     * Pattern to detect types that, if found, are very likely used in Objective-C.
     * The look-arounds require that the type name is not embedded in a longer
     * identifier, while still matching at the very start or end of the content.
     */
    private static final Pattern OBJECTIVE_C_COMMON_TYPES_HINT_PATTERN = Pattern.compile(
            "(?<!\\w)(NSString|NSMutableString|NSMutableArray|NSSet|NSDictionary|NSMutableDictionary|NSArray)(?!\\w)");

    /**
     * Identifiers used in typical Visual Basic files. The look-arounds require
     * that the identifier is not embedded in a longer word, while still matching
     * at the very start or end of the content (e.g. a file ending in "End Sub"
     * without a trailing line break).
     */
    private static final Pattern VISUAL_BASIC_CONTENT_PATTERN = Pattern
            .compile("(?<!\\w)(End Module|End Sub|imports|Dim|BEGIN)(?!\\w)");

    /**
     * Extensions of files we know to be text files. These should be all lower-case.
     */
    private static final String[] KNOWN_TEXT_FILE_EXTENSIONS = { FileExtensions.DOT_CSV, FileExtensions.DOT_TXT,
            FileExtensions.DOT_HTML, FileExtensions.DOT_ECUEXTRACT, FileExtensions.DOT_ECUEXTRACT_VARIANT,
            FileExtensions.DOT_ARXML, FileExtensions.DOT_ECUCONFIG };

    /** Matches a single ASCII character; used to count ASCII characters. */
    private static final Pattern ASCII_CHARACTER_PATTERN = Pattern.compile("\\p{ASCII}");

    /**
     * Number of leading characters that are inspected to decide whether content
     * is likely binary. Shorter content is never classified as binary.
     */
    private static final int BINARY_DETECTION_SAMPLE_LENGTH = 200;

    /**
     * Content whose inspected prefix has a lower share of ASCII characters than
     * this ratio is considered likely binary.
     */
    private static final double MIN_ASCII_RATIO_FOR_TEXT = .75;

    /**
     * Returns the {@link ELanguage} based on the configured languages and the
     * element name. If no file extension is present we assume the file is a
     * {@link ELanguage#CPP} header. If the file extension matches any of the
     * {@link #KNOWN_TEXT_FILE_EXTENSIONS} we assume the language is
     * {@link ELanguage#LINE}. Otherwise, the language is determined based on the
     * file extension and the configured languages. {@link ELanguage#LINE} is used
     * as a fallback, unless the file is likely binary, in which case an error is
     * logged and <code>null</code> is returned.
     *
     * @param configuredLanguages
     *            the languages used to narrow down the candidates when an
     *            extension maps to more than one language.
     * @param uniformPath
     *            the uniform path of the file; only its element name is used for
     *            the decision.
     * @param textContent
     *            the content of the file. Expected to be non-null, as it is fed
     *            to regex matchers for ambiguous extensions.
     * @return the resolved language, or <code>null</code> for files with an
     *         unknown extension and likely binary content.
     */
    public static ELanguage getLanguageFromUniformPath(Set<ELanguage> configuredLanguages, String uniformPath,
            String textContent) {
        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale
        // (e.g. a Turkish locale would turn 'I' into a dotless 'ı' and break the
        // extension lookup).
        String elementName = UniformPathUtils.getElementName(uniformPath).toLowerCase(Locale.ROOT);

        if (!elementName.contains(".")) {
            // no extension, so we guess CPP header
            return ELanguage.CPP;
        }
        if (StringUtils.endsWithOneOf(elementName, KNOWN_TEXT_FILE_EXTENSIONS)) {
            return ELanguage.LINE;
        }

        String extension = UniformPathUtils.getExtension(elementName);
        CCSMAssert.isNotNull(extension);

        // NOTE(review): this compares the result of getExtension() with a
        // DOT_-prefixed constant, while the known-text check above matches DOT_
        // constants against the end of the element name. Confirm that both use the
        // same with/without-dot convention.
        if (extension.equals(FileExtensions.DOT_CLS)) {
            if (VISUAL_BASIC_CONTENT_PATTERN.matcher(textContent).find()) {
                return ELanguage.VB;
            }
            return ELanguage.LINE;
        }

        Set<ELanguage> languages = ELanguage.getAllLanguagesForExtension(extension);
        if (languages.isEmpty()) {
            return determineLanguageForUnknownFileExtension(uniformPath, textContent);
        }
        if (languages.size() == 1) {
            return CollectionUtils.getAny(languages);
        }

        Set<ELanguage> languageCandidates = CollectionUtils.intersectionSet(languages, configuredLanguages);
        if (!languageCandidates.isEmpty()) {
            return decideForLanguage(languageCandidates, textContent);
        }
        // multiple candidates for file extension but no appropriate language configured
        // in analysis profile -> choose "first" matching language
        return ELanguage.fromPath(elementName);
    }

    /**
     * Determines the language for a file whose extension is not associated with
     * any language: {@link ELanguage#LINE} for text content, or <code>null</code>
     * (after logging an error) if the content is likely binary.
     */
    private static ELanguage determineLanguageForUnknownFileExtension(String uniformPath, String textContent) {
        if (isLikelyBinaryContent(textContent)) {
            LogManager.getLogger().error("Ignoring file with unsupported extension and likely binary content: "
                    + uniformPath
                    + ". If this file is a valid code file that should be analyzed, please use an explicit language mapping in the connector configuration.");
            return null;
        }
        return ELanguage.LINE;
    }

    /**
     * Heuristically decides whether the content is binary by inspecting its first
     * {@link #BINARY_DETECTION_SAMPLE_LENGTH} characters: the content is
     * considered binary if less than {@link #MIN_ASCII_RATIO_FOR_TEXT} of them are
     * ASCII characters. Content shorter than the sample length is never
     * considered binary.
     */
    private static boolean isLikelyBinaryContent(String textContent) {
        String contentStart = StringUtils.getFirstCharacters(textContent, BINARY_DETECTION_SAMPLE_LENGTH);
        if (contentStart.length() < BINARY_DETECTION_SAMPLE_LENGTH) {
            // too short for reliable decision
            return false;
        }

        long numAsciiCharacters = 0;
        Matcher matcher = ASCII_CHARACTER_PATTERN.matcher(contentStart);
        while (matcher.find()) {
            numAsciiCharacters++;
        }
        return (double) numAsciiCharacters / contentStart.length() < MIN_ASCII_RATIO_FOR_TEXT;
    }

    /**
     * Decides the language for cases in which a single extension is used in
     * multiple languages.
     *
     * @param languageCandidates
     *            A set of languages which includes only those with an overlap for
     *            the current file's extension.
     * @param textContent
     *            the file content that is inspected for Objective-C indicators.
     * @return Objective-C if the content looks like Objective-C and the
     *         candidates are exactly Objective-C plus C++ or Matlab; the other
     *         language of that pair otherwise. For any other candidate set, an
     *         arbitrary candidate is returned.
     */
    private static ELanguage decideForLanguage(Set<ELanguage> languageCandidates, String textContent) {
        // overlap in .h, .c, and .cpp files
        if (languageCandidates.equals(OBJECTIVE_C_AND_CPP)) {
            return isLikelyObjectiveCCode(textContent) ? ELanguage.OBJECTIVE_C : ELanguage.CPP;
        }
        // overlap in .m files
        if (languageCandidates.equals(OBJECTIVE_C_AND_MATLAB)) {
            return isLikelyObjectiveCCode(textContent) ? ELanguage.OBJECTIVE_C : ELanguage.MATLAB;
        }
        return CollectionUtils.getAny(languageCandidates);
    }

    /**
     * Returns true if the file content likely represents Objective-C code. This
     * heuristic is only meant to resolve conflicts between Objective-C and C++
     * header files using the *.h file extension as well as Objective-C and Matlab
     * source files using the *.m file extension, and is also used in some checks.
     *
     * We use some indicator keywords that are unique for Objective-C (@...). As
     * there are headers which do not contain these, we additionally use the
     * {@code #import} directive. While, theoretically, {@code #import} can also
     * occur in C++, it is rather uncommon there, and we only apply this in mixed
     * projects using both C++ and ObjC, so the heuristic should be fine. In
     * addition, common Objective-C type names such as {@code NSString} count as
     * an indicator. See TS-24483.
     *
     * @param textContent
     *            the file content to inspect; expected to be non-null.
     * @return whether one of the Objective-C keywords or common type names occurs
     *         in the content.
     */
    public static boolean isLikelyObjectiveCCode(String textContent) {
        return StringUtils.containsOneOf(textContent, OBJECTIVE_C_COMMON_KEYWORDS)
                || OBJECTIVE_C_COMMON_TYPES_HINT_PATTERN.matcher(textContent).find();
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy