// Class: eu.cqse.check.framework.scanner.ambiguous_language.AmbiguousLanguageResolutionUtils
// Artifact: teamscale-commons
package eu.cqse.check.framework.scanner.ambiguous_language;
import static eu.cqse.check.framework.scanner.ELanguage.C;
import static eu.cqse.check.framework.scanner.ELanguage.CPP;
import static eu.cqse.check.framework.scanner.ELanguage.LINE;
import static eu.cqse.check.framework.scanner.ELanguage.MATLAB;
import static eu.cqse.check.framework.scanner.ELanguage.OBJECTIVE_C;
import static eu.cqse.check.framework.scanner.ELanguage.OBJECTIVE_CPP;
import static eu.cqse.check.framework.scanner.ELanguage.VB;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.logging.log4j.LogManager;
import org.conqat.engine.resource.util.UniformPathUtils;
import org.conqat.lib.commons.assertion.CCSMAssert;
import org.conqat.lib.commons.collections.CollectionUtils;
import org.conqat.lib.commons.filesystem.FileExtensions;
import org.conqat.lib.commons.string.StringUtils;
import com.google.common.annotations.VisibleForTesting;
import eu.cqse.check.framework.scanner.ELanguage;
/**
 * Utility methods for deciding which language should be used (unless there is
 * an explicit language mapping configured) to process a file with an ambiguous
 * file ending. For example, '.h' could be a C/C++ or an Objective-C file.
 */
public class AmbiguousLanguageResolutionUtils {

    /**
     * Priority list of languages that share file extensions. If we see a file
     * with a shared extension and have several of these languages in the analysis
     * profile, we look at the text to decide for a language. If this text
     * analysis is still ambiguous, we choose the first matching language from
     * this priority list.
     * <p>
     * If we are wrong, the user/admin has to manually fix this via the language
     * mapping option.
     * <p>
     * List semantics: the first language has the highest priority, the last one
     * the lowest.
     */
    private static final List<ELanguage> AMBIGUOUS_LANGUAGES_PRIORITY_LIST = Arrays.asList(C, CPP, OBJECTIVE_C,
            OBJECTIVE_CPP, MATLAB);

    /** Identifiers used in typical Visual Basic files. */
    private static final Pattern VISUAL_BASIC_CONTENT_PATTERN = Pattern
            .compile("\\W(End Module|End Sub|imports|Dim|BEGIN)\\W");

    /**
     * Extensions of files we know to be text files. These should be all
     * lower-case.
     */
    private static final String[] KNOWN_TEXT_FILE_EXTENSIONS = { FileExtensions.DOT_CSV, FileExtensions.DOT_TXT,
            FileExtensions.DOT_HTML, FileExtensions.DOT_ECUEXTRACT, FileExtensions.DOT_ECUEXTRACT_VARIANT,
            FileExtensions.DOT_ARXML, FileExtensions.DOT_ECUCONFIG };

    /** Matches a single ASCII character. */
    private static final Pattern ASCII_CHARACTER_PATTERN = Pattern.compile("\\p{ASCII}");

    /** Number of leading characters inspected by the binary-content heuristic. */
    private static final int BINARY_CHECK_SAMPLE_LENGTH = 200;

    /**
     * Minimal share of ASCII characters in the inspected sample for content to be
     * treated as text.
     */
    private static final double MIN_ASCII_RATIO_FOR_TEXT = .75;

    /**
     * Returns the {@link ELanguage} based on the configured languages and the
     * element name. If no file extension is present we assume the file is a
     * {@link ELanguage#CPP} header. If the file extension matches any of the
     * {@link #KNOWN_TEXT_FILE_EXTENSIONS} we assume the language is
     * {@link ELanguage#LINE}. Otherwise, the language is determined based on the
     * file extension and the configured languages. {@link ELanguage#LINE} is used
     * as a fallback, unless the file is likely binary, in which case an error is
     * logged and <code>null</code> is returned.
     *
     * @param configuredLanguages
     *            the languages enabled in the analysis profile
     * @param uniformPath
     *            the uniform path of the file; only its element name is inspected
     * @param fileContent
     *            the textual content of the file, used for content-based
     *            heuristics
     * @return the chosen language, or <code>null</code> if the extension is
     *         unknown and the content looks binary
     */
    public static ELanguage getLanguageFromUniformPath(Set<ELanguage> configuredLanguages, String uniformPath,
            String fileContent) {
        // Locale.ROOT keeps the result independent of the JVM's default locale
        // (e.g. a Turkish locale would map 'I' to a dotless 'ı' and break the
        // extension lookup for names such as "FOO.INC").
        String elementName = UniformPathUtils.getElementName(uniformPath).toLowerCase(Locale.ROOT);

        if (!elementName.contains(".")) {
            // no extension, so we guess CPP header
            return CPP;
        }
        if (StringUtils.endsWithOneOf(elementName, KNOWN_TEXT_FILE_EXTENSIONS)) {
            return LINE;
        }

        String extension = UniformPathUtils.getExtension(elementName);
        CCSMAssert.isNotNull(extension);

        // NOTE(review): this compares against a DOT_-prefixed constant; confirm
        // that getExtension() returns the extension in the same form.
        if (extension.equals(FileExtensions.DOT_CLS)) {
            return VISUAL_BASIC_CONTENT_PATTERN.matcher(fileContent).find() ? VB : LINE;
        }

        Set<ELanguage> languagesForFileExtension = ELanguage.getAllLanguagesForExtension(extension);
        if (languagesForFileExtension.isEmpty()) {
            return determineLanguageForUnknownFileExtension(uniformPath, fileContent);
        }
        if (languagesForFileExtension.size() == 1) {
            return CollectionUtils.getAny(languagesForFileExtension);
        }

        Set<ELanguage> languageCandidates = CollectionUtils.intersectionSet(languagesForFileExtension,
                configuredLanguages);
        if (languageCandidates.isEmpty()) {
            // multiple candidates for file extension but no appropriate language
            // configured in analysis profile -> choose "first" matching language
            // (always the same language)
            return languagesForFileExtension.stream().sorted().findFirst().get();
        }
        return decideForAmbiguousLanguage(languageCandidates, fileContent);
    }

    /**
     * Handles files whose extension is not associated with any language: logs an
     * error and returns <code>null</code> if the content is likely binary,
     * otherwise falls back to {@link ELanguage#LINE}.
     */
    private static ELanguage determineLanguageForUnknownFileExtension(String uniformPath, String fileContent) {
        if (isLikelyBinaryContent(fileContent)) {
            LogManager.getLogger().error("Ignoring file with unsupported extension and likely binary content: "
                    + uniformPath
                    + ". If this file is a valid code file that should be analyzed, please use an explicit language mapping in the connector configuration.");
            return null;
        }
        return LINE;
    }

    /**
     * Heuristic binary check: inspects the first
     * {@value #BINARY_CHECK_SAMPLE_LENGTH} characters and reports the content as
     * binary if less than 75% of them are ASCII. Content shorter than the sample
     * length is never reported as binary, as the sample is too small for a
     * reliable decision.
     */
    private static boolean isLikelyBinaryContent(String fileContent) {
        String contentStart = StringUtils.getFirstCharacters(fileContent, BINARY_CHECK_SAMPLE_LENGTH);
        if (contentStart.length() < BINARY_CHECK_SAMPLE_LENGTH) {
            // too short for reliable decision
            return false;
        }
        long numAsciiCharacters = 0;
        Matcher matcher = ASCII_CHARACTER_PATTERN.matcher(contentStart);
        while (matcher.find()) {
            numAsciiCharacters++;
        }
        return (double) numAsciiCharacters / contentStart.length() < MIN_ASCII_RATIO_FOR_TEXT;
    }

    /**
     * Decides the language for cases in which a single file extension is used in
     * multiple languages.
     * <p>
     * The following file extensions have overlapping languages:
     * <ul>
     * <li>.h: Objective-C++, Objective-C, C++, C</li>
     * <li>.inc: Objective-C++, C++, C</li>
     * <li>.inl: Objective-C++, C++</li>
     * <li>.m: Matlab, Objective-C</li>
     * </ul>
     * For each file extension the right-most language in the listing above is
     * the default (this mirrors {@link #AMBIGUOUS_LANGUAGES_PRIORITY_LIST}), if
     * that language was selected for the project. We then check whether the file
     * content provides evidence for one of the other selected languages.
     * <p>
     * Decision order:
     * <ol>
     * <li>If the content shows both C++ and Objective-C patterns and
     * {@link ELanguage#OBJECTIVE_CPP} is a candidate, Objective-C++ is
     * chosen.</li>
     * <li>Otherwise, the highest-priority candidate for which content evidence
     * was found is chosen.</li>
     * <li>Otherwise, the highest-priority candidate is chosen.</li>
     * <li>If no candidate is part of the priority list, the smallest candidate in
     * natural enum order is chosen, so that the result is deterministic.</li>
     * </ol>
     *
     * @param languageCandidates
     *            A set of languages which includes only those with an overlap for
     *            the current file's extension taking into consideration the
     *            selected languages for the project. Must not be empty.
     * @param fileContent
     *            the textual content of the file that is scanned for
     *            language-specific patterns
     * @return the language chosen for the file
     */
    @VisibleForTesting
    public static ELanguage decideForAmbiguousLanguage(Set<ELanguage> languageCandidates, String fileContent) {
        CCSMAssert.isNotEmpty(languageCandidates, "languageCandidates was empty");

        // Objective-C++ has no patterns of its own: it is recognized by finding
        // both C++ and Objective-C patterns in the same file.
        EnumSet<ELanguage> patternsToScanFor = EnumSet.noneOf(ELanguage.class);
        if (languageCandidates.contains(CPP) || languageCandidates.contains(OBJECTIVE_CPP)) {
            patternsToScanFor.add(CPP);
        }
        if (languageCandidates.contains(OBJECTIVE_C) || languageCandidates.contains(OBJECTIVE_CPP)) {
            patternsToScanFor.add(OBJECTIVE_C);
        }
        if (languageCandidates.contains(MATLAB)) {
            patternsToScanFor.add(MATLAB);
        }

        EnumSet<ELanguage> foundLanguagePatterns = scanCodeForLanguageSpecificPatterns(fileContent,
                patternsToScanFor);
        if (foundLanguagePatterns.containsAll(EnumSet.of(OBJECTIVE_C, CPP))) {
            foundLanguagePatterns.add(OBJECTIVE_CPP);
        }

        Set<ELanguage> foundLanguageCandidates = CollectionUtils.intersectionSet(languageCandidates,
                foundLanguagePatterns);
        if (foundLanguageCandidates.contains(OBJECTIVE_CPP)) {
            return OBJECTIVE_CPP;
        }

        Optional<ELanguage> languageWithEvidence = findFirstByPriority(foundLanguageCandidates);
        if (languageWithEvidence.isPresent()) {
            return languageWithEvidence.get();
        }

        // We did not find evidence for any of the languages that could aid with
        // the decision. Now just have to decide based on a hard-coded priority
        // list.
        Optional<ELanguage> languageByPriority = findFirstByPriority(languageCandidates);
        if (languageByPriority.isPresent()) {
            return languageByPriority.get();
        }

        // None of the candidates is in the priority list. Sort before picking so
        // that the choice does not depend on the iteration order of the set.
        return languageCandidates.stream().sorted().findFirst().get();
    }

    /**
     * Returns the first entry of {@link #AMBIGUOUS_LANGUAGES_PRIORITY_LIST} that
     * is contained in the given set, or an empty optional if there is none.
     */
    private static Optional<ELanguage> findFirstByPriority(Set<ELanguage> languages) {
        return AMBIGUOUS_LANGUAGES_PRIORITY_LIST.stream().filter(languages::contains).findFirst();
    }

    /**
     * Scans the given file content for language-specific patterns and returns the
     * languages for which patterns were found. The result may contain
     * {@link ELanguage#MATLAB} (if requested and
     * {@link MatlabLanguageDetector#containsLikelyMatlabCode(String)} reports a
     * match) as well as whatever {@link CLikeLanguageDetector} reports when
     * {@link ELanguage#CPP} or {@link ELanguage#OBJECTIVE_C} is requested. The
     * result is empty if nothing was found or nothing was requested.
     *
     * @param fileContent
     *            the textual content to scan
     * @param patternsToSearchFor
     *            the languages whose patterns should be searched for
     * @return a new, mutable set of the languages for which patterns were found
     */
    private static EnumSet<ELanguage> scanCodeForLanguageSpecificPatterns(String fileContent,
            EnumSet<ELanguage> patternsToSearchFor) {
        EnumSet<ELanguage> foundLanguagePatterns = EnumSet.noneOf(ELanguage.class);
        if (patternsToSearchFor.isEmpty()) {
            return foundLanguagePatterns;
        }
        if (patternsToSearchFor.contains(MATLAB) && MatlabLanguageDetector.containsLikelyMatlabCode(fileContent)) {
            foundLanguagePatterns.add(MATLAB);
        }
        if (patternsToSearchFor.contains(CPP) || patternsToSearchFor.contains(OBJECTIVE_C)) {
            foundLanguagePatterns
                    .addAll(CLikeLanguageDetector.scanForClikeLanguagePatterns(fileContent, patternsToSearchFor));
        }
        return foundLanguagePatterns;
    }
}