All downloads are free. The search and download functionality uses the official Maven repository.

eu.cqse.check.framework.scanner.AmbiguousLanguageResolutionUtils Maven / Gradle / Ivy

package eu.cqse.check.framework.scanner;

import java.util.EnumSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.logging.log4j.LogManager;
import org.conqat.engine.resource.util.UniformPathUtils;
import org.conqat.lib.commons.assertion.CCSMAssert;
import org.conqat.lib.commons.collections.CollectionUtils;
import org.conqat.lib.commons.filesystem.FileExtensions;
import org.conqat.lib.commons.string.StringUtils;

/**
 * Util methods for deciding which language we should use (unless there is an
 * explicit language mapping configured) to process a file with an ambiguous
 * file ending. For example '.h' could be a C/C++ or Objective-C file.
 */
public class AmbiguousLanguageResolutionUtils {

	/**
	 * C and Objective-C have an overlap on the file endings ".c", ".h", and ".cpp".
	 */
	private static final EnumSet OBJECTIVE_C_AND_CPP = EnumSet.of(ELanguage.OBJECTIVE_C, ELanguage.CPP);

	/**
	 * Objective-C and Matlab have an overlap on the file ending ".m".
	 */
	private static final EnumSet OBJECTIVE_C_AND_MATLAB = EnumSet.of(ELanguage.OBJECTIVE_C,
			ELanguage.MATLAB);
	/**
	 * Keywords in code that indicate Objective-C code uniquely when both
	 * Objective-C and another language with overlapping file endings like C/C++ or
	 * Matlab are possible. Note that there is also an #import
	 * statement in C++ to include type libraries. However, for projects with both
	 * C/C++ and Objective-C it is extremely likely that this is still an
	 * Objective-C file since imports of type libraries are rare in practice
	 * especially for projects using Objective-C. Therefore, we include the
	 * {@code #import} as a common Objective-C keyword.
	 */
	private static final String[] OBJECTIVE_C_COMMON_KEYWORDS = { "@end", "@interface", "@property", "@protocol",
			"@import", "#import", "@class" };
	/**
	 * Pattern to detect types that, if found, are very likely used in Objective-C.
	 */
	private static final Pattern OBJECTIVE_C_COMMON_TYPES_HINT_PATTERN = Pattern
			.compile("\\W(NSString|NSMutableString|NSMutableArray|NSSet|NSDictionary|NSMutableDictionary|NSArray)\\W");

	/** Identifiers used in typical Visual Basic files. */
	private static final Pattern VISUAL_BASIC_CONTENT_PATTERN = Pattern
			.compile("\\W(End Module|End Sub|imports|Dim|BEGIN)\\W");

	/**
	 * Extensions of files we know to be text files. These should be all lower-case.
	 */
	private static final String[] KNOWN_TEXT_FILE_EXTENSIONS = { FileExtensions.DOT_CSV, FileExtensions.DOT_TXT,
			FileExtensions.DOT_HTML, FileExtensions.DOT_ECUEXTRACT, FileExtensions.DOT_ECUEXTRACT_VARIANT,
			FileExtensions.DOT_ARXML, FileExtensions.DOT_ECUCONFIG };
	private static final Pattern ASCII_CHARACTER_PATTERN = Pattern.compile("\\p{ASCII}");

	/**
	 * Returns the {@link ELanguage} based on the configured languages and the
	 * element name. If no file extension is present we assume the file is a
	 * {@link ELanguage#CPP} header. If the file extensions matches any of the
	 * {@link #KNOWN_TEXT_FILE_EXTENSIONS} we assume the language is
	 * {@link ELanguage#LINE}. Otherwise, the language is determined based on the
	 * file extensions and the configured languages. {@link ELanguage#LINE} is used
	 * as a fallback, unless the file is likely binary in which case an error is
	 * logged and null is returned.
	 */
	public static ELanguage getLanguageFromUniformPath(Set configuredLanguages, String uniformPath,
			String textContent) {
		String elementName = UniformPathUtils.getElementName(uniformPath);
		elementName = elementName.toLowerCase();

		if (!elementName.contains(".")) {
			// no extension, so we guess CPP header
			return ELanguage.CPP;
		}

		if (StringUtils.endsWithOneOf(elementName, KNOWN_TEXT_FILE_EXTENSIONS)) {
			return ELanguage.LINE;
		}

		String extension = UniformPathUtils.getExtension(elementName);
		CCSMAssert.isNotNull(extension);

		if (extension.equals(FileExtensions.DOT_CLS)) {
			if (VISUAL_BASIC_CONTENT_PATTERN.matcher(textContent).find()) {
				return ELanguage.VB;
			}
			return ELanguage.LINE;
		}

		Set languages = ELanguage.getAllLanguagesForExtension(extension);
		if (languages.isEmpty()) {
			return determineLanguageForUnknownFileExtension(uniformPath, textContent);
		} else if (languages.size() == 1) {
			return CollectionUtils.getAny(languages);
		} else {
			Set languageCandidates = CollectionUtils.intersectionSet(languages, configuredLanguages);
			if (!languageCandidates.isEmpty()) {
				return decideForLanguage(languageCandidates, textContent);
			}
			// multiple candidates for file extension but no appropriate language configured
			// in analysis profile -> choose "first" matching language
			return ELanguage.fromPath(elementName);
		}
	}

	private static ELanguage determineLanguageForUnknownFileExtension(String uniformPath, String textContent) {
		if (isLikelyBinaryContent(textContent)) {
			LogManager.getLogger().error("Ignoring file with unsupported extension and likely binary content: "
					+ uniformPath
					+ ". If this file is a valid code file that should be analyzed, please use an explicit language mapping in the connector configuration.");
			return null;
		}
		return ELanguage.LINE;
	}

	private static boolean isLikelyBinaryContent(String textContent) {
		String contentStart = StringUtils.getFirstCharacters(textContent, 200);
		if (contentStart.length() < 200) {
			// too short for reliable decision
			return false;
		}

		long numAsciiCharacters = 0;
		Matcher matcher = ASCII_CHARACTER_PATTERN.matcher(contentStart);
		while (matcher.find()) {
			numAsciiCharacters++;
		}

		return ((double) numAsciiCharacters / contentStart.length() < .75);
	}

	/**
	 * Decides the language for cases in which a single extension is used in
	 * multiple languages.
	 *
	 * @param languageCandidates
	 *            A set of languages which includes only those with an overlap for
	 *            the current file's extension.
	 */
	private static ELanguage decideForLanguage(Set languageCandidates, String textContent) {
		// overlap in .h, .c, and .cpp files
		if (languageCandidates.equals(OBJECTIVE_C_AND_CPP)) {
			if (isLikelyObjectiveCCode(textContent)) {
				return ELanguage.OBJECTIVE_C;
			}
			return ELanguage.CPP;
		}

		// overlap in .m files
		if (languageCandidates.equals(OBJECTIVE_C_AND_MATLAB)) {
			if (isLikelyObjectiveCCode(textContent)) {
				return ELanguage.OBJECTIVE_C;
			}
			return ELanguage.MATLAB;
		}

		return CollectionUtils.getAny(languageCandidates);
	}

	/**
	 * Returns true if the file content likely represents Objective-C code. This
	 * heuristic is only to resolve conflicts between Objective-C and C++ header
	 * files using the *.h file extension as well as Objective-C and Matlab source
	 * files using the *.m file extension and also in some checks.
	 * 

* We use some indicator keywords that are unique for Objective-C (@...). As * there are headers which do not contain these, we additionally use the #import * directive. While, theoretically, #import can also occur in C++ (explanation) * it is rather uncommon, and we only apply this in mixed projects using both * C++ and ObjC, so the heuristic should be fine. See * TS-24483. */ public static boolean isLikelyObjectiveCCode(String textContent) { return StringUtils.containsOneOf(textContent, OBJECTIVE_C_COMMON_KEYWORDS) || OBJECTIVE_C_COMMON_TYPES_HINT_PATTERN.matcher(textContent).find(); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy