All Downloads are FREE. Search and download functionalities are using the official Maven repository.

eu.cqse.check.framework.scanner.ambiguous_language.AmbiguousLanguageResolutionUtils Maven / Gradle / Ivy

There is a newer version: 2025.1.0-rc2
Show newest version
package eu.cqse.check.framework.scanner.ambiguous_language;

import static eu.cqse.check.framework.scanner.ELanguage.C;
import static eu.cqse.check.framework.scanner.ELanguage.CPP;
import static eu.cqse.check.framework.scanner.ELanguage.LINE;
import static eu.cqse.check.framework.scanner.ELanguage.MATLAB;
import static eu.cqse.check.framework.scanner.ELanguage.OBJECTIVE_C;
import static eu.cqse.check.framework.scanner.ELanguage.OBJECTIVE_CPP;
import static eu.cqse.check.framework.scanner.ELanguage.VB;

import java.util.Arrays;
import java.util.EnumSet;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.logging.log4j.LogManager;
import org.checkerframework.checker.nullness.qual.NonNull;
import org.conqat.engine.resource.util.UniformPathUtils;
import org.conqat.lib.commons.assertion.CCSMAssert;
import org.conqat.lib.commons.collections.CollectionUtils;
import org.conqat.lib.commons.filesystem.FileExtensions;
import org.conqat.lib.commons.string.StringUtils;

import com.google.common.annotations.VisibleForTesting;

import eu.cqse.check.framework.scanner.ELanguage;

/**
 * Util methods for deciding which language we should use (unless there is an
 * explicit language mapping configured) to process a file with an ambiguous
 * file ending. For example '.h' could be a C/C++ or Objective-C file.
 */
public class AmbiguousLanguageResolutionUtils {

	/**
	 * Priority list of languages that share file extensions. If we see a file with
	 * a shared extension and have both languages in the analysis profile, we need
	 * to look at the text to decide for a language. If this text analysis is still
	 * ambiguous, we just choose the first language from this priority list.
	 * 

* If we are wrong, the user/admin has to manually fix this via the language * mapping option. *

* List semantics: the first Language is the one that will be chosen with the * most priority, the last one with the least priority. */ private static final List AMBIGUOUS_LANGUAGES_PRIORITY_LIST = Arrays.asList(C, CPP, MATLAB, OBJECTIVE_C, OBJECTIVE_CPP); /** Identifiers used in typical Visual Basic files. */ private static final Pattern VISUAL_BASIC_CONTENT_PATTERN = Pattern .compile("\\W(End Module|End Sub|imports|Dim|BEGIN)\\W"); /** * Extensions of files we know to be text files. These should be all lower-case. */ private static final String[] KNOWN_TEXT_FILE_EXTENSIONS = { FileExtensions.DOT_CSV, FileExtensions.DOT_TXT, FileExtensions.DOT_HTML, FileExtensions.DOT_ECUEXTRACT, FileExtensions.DOT_ECUEXTRACT_VARIANT, FileExtensions.DOT_ARXML, FileExtensions.DOT_ECUCONFIG }; private static final Pattern ASCII_CHARACTER_PATTERN = Pattern.compile("\\p{ASCII}"); /** * Pattern that matches on #include and #import directives in C code (import is * also used in the MS variant of C). *

* The directives must be at the beginning of a line (optionally with leading * spaces). */ private static final Pattern OBJECTIVE_C_INCLUDE_MATCHER = Pattern.compile("(^|\\n)\\s*#(include|import)\\s"); /** * Returns the {@link ELanguage} based on the configured languages and the * element name. If no file extension is present we assume the file is a * {@link ELanguage#CPP} header. If the file extensions matches any of the * {@link #KNOWN_TEXT_FILE_EXTENSIONS} we assume the language is * {@link ELanguage#LINE}. Otherwise, the language is determined based on the * file extensions and the configured languages. {@link ELanguage#LINE} is used * as a fallback, unless the file is likely binary in which case an error is * logged and null is returned. */ public static ELanguage getLanguageFromUniformPath(Set configuredLanguages, String uniformPath, String fileContent) { String elementName = UniformPathUtils.getElementName(uniformPath); elementName = elementName.toLowerCase(); if (!elementName.contains(".")) { if (isLikelyBinaryContent(fileContent)) { logErrorAboutBinaryContent(uniformPath); return null; } // no extension, so we guess CPP header return CPP; } if (StringUtils.endsWithOneOf(elementName, KNOWN_TEXT_FILE_EXTENSIONS)) { return LINE; } String extension = UniformPathUtils.getExtension(elementName); CCSMAssert.isNotNull(extension); if (extension.equals(FileExtensions.DOT_CLS)) { if (VISUAL_BASIC_CONTENT_PATTERN.matcher(fileContent).find()) { return VB; } return LINE; } Set languagesForFileExtension = ELanguage.getAllLanguagesForExtension(extension); if (languagesForFileExtension.isEmpty()) { return determineLanguageForUnknownFileExtension(uniformPath, fileContent); } else if (languagesForFileExtension.size() == 1) { return CollectionUtils.getAny(languagesForFileExtension); } else { return decideForLanguage(configuredLanguages, fileContent, languagesForFileExtension); } } private static ELanguage decideForLanguage(Set configuredLanguages, String fileContent, Set 
languagesForFileExtension) { Set languageCandidates = CollectionUtils.intersectionSet(languagesForFileExtension, configuredLanguages); if (languageCandidates.isEmpty()) { // multiple candidates for file extension but no appropriate language configured // in analysis profile -> choose "first" matching language (always the same // language) return languagesForFileExtension.stream().sorted().findFirst().get(); } return decideForAmbiguousLanguage(languageCandidates, fileContent); } private static ELanguage determineLanguageForUnknownFileExtension(String uniformPath, String fileContent) { if (isLikelyBinaryContent(fileContent)) { logErrorAboutBinaryContent(uniformPath); return null; } return LINE; } private static void logErrorAboutBinaryContent(String uniformPath) { LogManager.getLogger().error("Ignoring file with unsupported extension and likely binary content: " + uniformPath + ". If this file is a valid code file that should be analyzed, please use an explicit language mapping in the connector configuration."); } private static boolean isLikelyBinaryContent(String fileContent) { String contentStart = StringUtils.getFirstCharacters(fileContent, 200); if (contentStart.length() < 200) { // too short for reliable decision return false; } long numAsciiCharacters = 0; Matcher matcher = ASCII_CHARACTER_PATTERN.matcher(contentStart); while (matcher.find()) { numAsciiCharacters++; } return ((double) numAsciiCharacters / contentStart.length() < .75); } /** * Decides the language for cases in which a single file extension is used in * multiple languages. *

* The following file extensions have overlapping languages: *

    *
  • .h: Objective-C++, Objective-C, C++, C
  • *
  • .inc: Objective-C++, C++, C
  • *
  • .inl: Objective-C++, C++
  • *
  • .m: Objective-C, Matlab
  • *
*

* For each file extension we assume the right most language in the listing * above as default, if the language was selected for the project. We then check * whether one of the languages listed before fits better taking into * consideration the selected languages for the project. * * @param languageCandidates * A set of languages which includes only those with an overlap for * the current file's extension taking into consideration the * selected languages for the project. Must not be empty. */ @VisibleForTesting public static ELanguage decideForAmbiguousLanguage(Set languageCandidates, String fileContent) { CCSMAssert.isNotEmpty(languageCandidates, "languageCandidates was empty"); if (languageCandidates.contains(MATLAB) && (languageCandidates.contains(OBJECTIVE_C) || languageCandidates.contains(OBJECTIVE_CPP))) { // The condition has only the Objective-C/Objective-C++ languages since those // have a clash with Matlab on .m files. if (OBJECTIVE_C_INCLUDE_MATCHER.matcher(fileContent).find()) { // If the code contains something that looks like a C include/import directive, // then it is very unlikely that it is matlab code (but we still have to decide // between the C derivatives). 
languageCandidates.remove(MATLAB); } } EnumSet patternsToScanFor = determinePatternsToScanFor(languageCandidates); EnumSet foundLanguagePatterns = scanCodeForLanguageSpecificPatterns(fileContent, patternsToScanFor); if (foundLanguagePatterns.containsAll(EnumSet.of(OBJECTIVE_C, CPP))) { foundLanguagePatterns.add(OBJECTIVE_CPP); } Set foundLanguageCandidates = CollectionUtils.intersectionSet(languageCandidates, foundLanguagePatterns); if (foundLanguageCandidates.contains(OBJECTIVE_CPP)) { return OBJECTIVE_CPP; } if (foundLanguageCandidates.contains(OBJECTIVE_C)) { return OBJECTIVE_C; } Optional foundLanguage = AMBIGUOUS_LANGUAGES_PRIORITY_LIST.stream() .filter(foundLanguageCandidates::contains).findFirst(); if (foundLanguage.isPresent()) { return foundLanguage.get(); } // We did not find evidence for any of the languages that could aid with the // decision. Now just have to decide based on a hard-coded priority list. foundLanguage = AMBIGUOUS_LANGUAGES_PRIORITY_LIST.stream().filter(languageCandidates::contains).findFirst(); if (foundLanguage.isPresent()) { return foundLanguage.get(); } // Just choose the first candidate. return languageCandidates.stream().findFirst().get(); } /** * Determines for which language patterns we need to search. For example, if * {@link ELanguage#OBJECTIVE_CPP} is a candidate, we need to search for C++ and * Objective-C patterns. 
*/ @NonNull private static EnumSet determinePatternsToScanFor(Set languageCandidates) { EnumSet patternsToScanFor = EnumSet.noneOf(ELanguage.class); if (languageCandidates.contains(CPP) || languageCandidates.contains(OBJECTIVE_CPP)) { patternsToScanFor.add(CPP); } if (languageCandidates.contains(OBJECTIVE_C) || languageCandidates.contains(OBJECTIVE_CPP)) { patternsToScanFor.add(OBJECTIVE_C); } if (languageCandidates.contains(MATLAB)) { patternsToScanFor.add(MATLAB); } return patternsToScanFor; } /** * Returns either {@link ELanguage#CPP}, {@link ELanguage#OBJECTIVE_C}, both or * an empty set based on the given file content. *

* Iterates through each line of the file. For each line, it is checked whether * the current line is a comment or contains actual code. If we are currently in * a comment, the line is skipped. Comments that are between code are filtered * out. The identified code parts of a line are then checked for unique keywords * for C++ and Objective-C. If we identify a keyword for a particular language * we add that language to the identified languages. */ private static EnumSet scanCodeForLanguageSpecificPatterns(String fileContent, EnumSet patternsToSearchFor) { if (patternsToSearchFor.isEmpty()) { return EnumSet.noneOf(ELanguage.class); } EnumSet foundLanguagePatterns = EnumSet.noneOf(ELanguage.class); if (patternsToSearchFor.contains(MATLAB) && MatlabLanguageDetector.containsLikelyMatlabCode(fileContent)) { foundLanguagePatterns.add(MATLAB); } if (patternsToSearchFor.contains(CPP) || patternsToSearchFor.contains(OBJECTIVE_C)) { foundLanguagePatterns .addAll(CLikeLanguageDetector.scanForClikeLanguagePatterns(fileContent, patternsToSearchFor)); } return foundLanguagePatterns; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy