/*
* Copyright (c) CQSE GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.conqat.engine.commons.findings.location;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.conqat.lib.commons.algo.Diff;
import org.conqat.lib.commons.algo.Diff.Delta;
import org.conqat.lib.commons.region.LineBasedRegion;
import org.conqat.lib.commons.region.Region;
import org.conqat.lib.commons.string.LineOffsetConverter;
/**
* This class is used for adjusting the offsets used in locations (i.e.,
* subclasses of {@link ElementLocation}) for text that is slightly modified. The
* main use-case is the update of locations where the local (adjusted) text has
* different line endings, different content due to keyword expansion, or minor
* local modifications compared to the text on which the analysis was executed
* (original text).
*
* Both the original and adjusted text may have arbitrary line endings.
*
* The implementation is based on a token diff, which can lead to minor
* deviations for offsets that are not aligned with token boundaries. A
* character diff would be more precise, but is too performance- and
* memory-intensive for large files.
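*
* <p>A minimal usage sketch (texts and offsets are purely illustrative):
*
* <pre>{@code
* LocationAdjuster adjuster = new LocationAdjuster(originalText, adjustedText);
* // zero-based, inclusive offsets in the original text
* Region adjusted = adjuster.getAdjustedRegion(10, 42);
* if (adjusted != null) {
*     // corresponding offsets in the adjusted text
*     int newStart = adjusted.getStart();
*     int newEnd = adjusted.getEnd();
* }
* }</pre>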
*/
public class LocationAdjuster implements ILineAdjuster {
/**
* If the number of tokens in the adjusted region differs from the number of
* tokens in the original region by more than this factor, the mapping is
* counted as wrong.
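* For example, with a factor of 2 a region of 10 original tokens is only
* accepted if at least 5 of them could be mapped to adjusted tokens.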
*/
private static final double LOSS_FACTOR = 2;
/** Maximal number of tokens in the diff we accept. */
private static final int MAX_DIFF_SIZE = 5000;
/**
* Pattern defining tokens for the diff. Matches either alphanumeric strings
* (typical identifiers), or single non-whitespace characters.
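* For example, {@code "a += b1;"} is split into the tokens {@code a},
* {@code +}, {@code =}, {@code b1}, and {@code ;}.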
*/
private static final Pattern TOKEN_PATTERN = Pattern.compile("[a-zA-Z0-9_]+|\\S");
/** The tokens of the original string. */
private final List<AdjusterToken> originalTokens;
/**
* Adjusted tokens corresponding to the {@link #originalTokens}. If there is no
* corresponding token, this list contains null at the index. If the content
* could not be matched/adjusted at all (too many differences), this field is
* null.
*/
private final List<AdjusterToken> mappedAdjustedTokens;
/** Line offset converter for the original text. */
private final LineOffsetConverter originalLineOffsetConverter;
/** Line offset converter for the adjusted text. */
private final LineOffsetConverter adjustedLineOffsetConverter;
/**
* Optional uniform path to correct element locations of
* {@link #adjustLocation(ElementLocation)}. Null if no correction will be
* performed.
*/
private final String adjustedUniformPath;
/**
* Constructor.
*
* WARNING: Creating a location adjuster is very expensive and should only be
* done once per file. In case originalText and adjustedText are identical, take a
* look at {@link SimpleValidLinesFilter}.
*
* @param originalText
* the text for which the input locations have been created, i.e. the
* text from the analysis. May be null, in which case no
* adjustment is performed.
* @param adjustedText
* the text for which the locations should be adjusted, i.e. the
* local text.
* @param adjustedUniformPath
* the adjusted uniform path for adjusted findings.
*/
public LocationAdjuster(String originalText, String adjustedText, String adjustedUniformPath) {
if (originalText == null) {
originalText = adjustedText;
}
this.adjustedUniformPath = adjustedUniformPath;
originalLineOffsetConverter = new LineOffsetConverter(originalText);
adjustedLineOffsetConverter = new LineOffsetConverter(adjustedText);
originalTokens = toTokens(originalText);
mappedAdjustedTokens = calculateMappedAdjustedTokens(adjustedText, originalTokens);
}
/**
* Constructor.
*
* @param originalText
* the text for which the input locations have been created, i.e. the
* text from the analysis.
* @param adjustedText
* the text for which the locations should be adjusted, i.e. the
* local text.
*/
public LocationAdjuster(String originalText, String adjustedText) {
this(originalText, adjustedText, null);
}
/**
* Calculates the {@link #mappedAdjustedTokens} based on the original tokens and
* the adjusted text. May return null if adjustment is not possible due to too
* many changes.
*/
private static List<AdjusterToken> calculateMappedAdjustedTokens(String adjustedText,
List<AdjusterToken> originalTokens) {
List<AdjusterToken> adjustedTokens = toTokens(adjustedText);
Delta<AdjusterToken> delta = Diff.computeDelta(originalTokens, adjustedTokens, MAX_DIFF_SIZE);
if (delta.getSize() >= MAX_DIFF_SIZE) {
return null;
}
return calculateMappedAdjustedTokensFromDelta(delta, originalTokens, adjustedTokens);
}
/**
* Calculates the {@link #mappedAdjustedTokens} based on the original tokens,
* the adjusted tokens, and the delta between them.
*/
private static List<AdjusterToken> calculateMappedAdjustedTokensFromDelta(Delta<AdjusterToken> delta,
List<AdjusterToken> originalTokens, List<AdjusterToken> adjustedTokens) {
List<AdjusterToken> mappedAdjustedTokens = new ArrayList<>(
Collections.nCopies(originalTokens.size(), (AdjusterToken) null));
int originalIndex = 0;
int adjustedIndex = 0;
for (int i = 0; i < delta.getSize(); ++i) {
int position = delta.getPosition(i);
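// Positive positions denote (one-based) tokens present only in the adjusted
// text (insertions), negative positions tokens present only in the original
// text (deletions); tokens before each delta position are mapped one-to-one.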
if (position > 0) {
position -= 1;
while (adjustedIndex < position) {
mappedAdjustedTokens.set(originalIndex++, adjustedTokens.get(adjustedIndex++));
}
adjustedIndex += 1;
} else {
position = -position - 1;
while (originalIndex < position) {
mappedAdjustedTokens.set(originalIndex++, adjustedTokens.get(adjustedIndex++));
}
originalIndex += 1;
}
}
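// tokens after the last delta position are mapped one-to-one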
while (originalIndex < originalTokens.size()) {
mappedAdjustedTokens.set(originalIndex++, adjustedTokens.get(adjustedIndex++));
}
return mappedAdjustedTokens;
}
/** Splits a string into tokens. */
private static List<AdjusterToken> toTokens(String s) {
List<AdjusterToken> tokens = new ArrayList<>();
Matcher matcher = TOKEN_PATTERN.matcher(s);
while (matcher.find()) {
tokens.add(new AdjusterToken(matcher.group(), matcher.start()));
}
return tokens;
}
/**
* Maps a zero-based offset range (both inclusive) to the adjusted string.
* Returns null if the region could not be approximately mapped.
*/
public Region getAdjustedRegion(int originalStartOffset, int originalEndOffset) {
if (mappedAdjustedTokens == null) {
return null;
}
Region originalIndexRegion = findOriginalIndexRegion(originalStartOffset, originalEndOffset);
if (originalIndexRegion.isEmpty()) {
return null;
}
int numOriginalTokens = originalIndexRegion.getLength();
int numAdjustedTokens = 0;
AdjusterToken firstAdjustedToken = null;
AdjusterToken lastAdjustedToken = null;
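// Count the original tokens in the region that still have a mapped counterpart
// and remember the first and last such adjusted token.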
for (int i = originalIndexRegion.getStart(); i <= originalIndexRegion.getEnd(); ++i) {
AdjusterToken adjustedToken = mappedAdjustedTokens.get(i);
if (adjustedToken != null) {
numAdjustedTokens += 1;
if (firstAdjustedToken == null) {
firstAdjustedToken = adjustedToken;
}
lastAdjustedToken = adjustedToken;
}
}
if (firstAdjustedToken == null || lastAdjustedToken == null
|| LOSS_FACTOR * numAdjustedTokens < numOriginalTokens) {
return null;
}
return new Region(firstAdjustedToken.startOffset, lastAdjustedToken.endOffset);
}
/**
* Returns the region of indexes in the {@link #originalTokens} contained in the
* given offsets.
*/
private Region findOriginalIndexRegion(int originalStartOffset, int originalEndOffset) {
AdjusterToken searchToken = new AdjusterToken(null, originalStartOffset, originalEndOffset);
int originalStartTokenIndex = Collections.binarySearch(originalTokens, searchToken,
AdjusterToken.COMPARE_BY_START_OFFSET);
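// a negative result encodes the insertion point as -(insertionPoint) - 1,
// i.e. this yields the index of the first token starting at or after the start offset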
if (originalStartTokenIndex < 0) {
originalStartTokenIndex = -originalStartTokenIndex - 1;
}
int originalEndTokenIndex = Collections.binarySearch(originalTokens, searchToken,
AdjusterToken.COMPARE_BY_END_OFFSET);
if (originalEndTokenIndex < 0) {
// we want insertion point -1
originalEndTokenIndex = -originalEndTokenIndex - 2;
}
if (originalEndTokenIndex + 1 < originalTokens.size()
&& originalTokens.get(originalEndTokenIndex + 1).startOffset < originalEndOffset) {
originalEndTokenIndex += 1;
}
return new Region(originalStartTokenIndex, originalEndTokenIndex);
}
/**
* Returns a new location with adjusted offsets (if necessary). Returns null if
* the location cannot be mapped to the adjusted text.
*/
public ElementLocation adjustLocation(ElementLocation location) {
if (location instanceof TextRegionLocation) {
return adjustLocation((TextRegionLocation) location);
}
// Other locations do not have offsets. If the uniform path should not
// be adjusted, simply return the original location.
if (adjustedUniformPath == null) {
return location;
}
if (location instanceof QualifiedNameLocation) {
return new QualifiedNameLocation(((QualifiedNameLocation) location).getQualifiedName(),
adjustedUniformPath);
}
return new ElementLocation(adjustedUniformPath);
}
/**
* Returns a new location with adjusted offsets (if necessary). Returns null if
* the location cannot be mapped to the adjusted text.
*/
public TextRegionLocation adjustLocation(TextRegionLocation location) {
int startOffset = location.getRawStartOffset();
int endOffset = location.getRawEndOffset();
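// Negative raw offsets mean that no offset information is available; in that
// case the offsets are derived from the (one-based) raw line numbers instead.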
if (startOffset < 0) {
int startLine = location.getRawStartLine();
if (!originalLineOffsetConverter.isValidLine(startLine)) {
return null;
}
startOffset = originalLineOffsetConverter.getOffset(startLine);
}
if (endOffset < 0) {
int endLine = location.getRawEndLine() + 1;
if (!originalLineOffsetConverter.isValidLine(endLine)) {
return null;
}
endOffset = originalLineOffsetConverter.getOffset(endLine) - 1;
}
Region adjustedOffsets = getAdjustedRegion(startOffset, endOffset);
if (adjustedOffsets == null || adjustedOffsets.isEmpty()) {
return null;
}
String uniformPath = location.getUniformPath();
if (adjustedUniformPath != null) {
uniformPath = adjustedUniformPath;
}
int newStartOffset = adjustedOffsets.getStart();
int newEndOffset = adjustedOffsets.getEnd();
return new TextRegionLocation(uniformPath, newStartOffset, newEndOffset,
adjustedLineOffsetConverter.getLine(newStartOffset), adjustedLineOffsetConverter.getLine(newEndOffset));
}
/**
* Adjusts the location of a single line. This only respects the token part of a
* line, i.e. leading and trailing whitespace of a line will be ignored. This
* method is robust w.r.t. line numbers that are out of the range of the
* original text. In case of such an invalid line, the line is added to the
* given set of invalid lines and null is returned.
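*
* <p>Usage sketch (the adjuster and the line number are illustrative):
*
* <pre>{@code
* Set<Integer> invalidLines = new HashSet<>();
* LineBasedRegion adjusted = adjuster.adjustLine(42, invalidLines);
* }</pre>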
*
* @param line
* the one-based line number to be adjusted.
* @param invalidLines
* used for collecting invalid lines.
*
* @return the one-based lines encoded as a region, as a line may map to
* multiple lines after changing. This may also return null, if no
* non-empty lines could be found that correspond to the input line
* after adjustment.
*/
@Override
public LineBasedRegion adjustLine(int line, Set<Integer> invalidLines) {
if (!originalLineOffsetConverter.isValidLine(line) || !originalLineOffsetConverter.isValidLine(line + 1)) {
invalidLines.add(line);
return null;
}
int originalStartOffset = originalLineOffsetConverter.getOffset(line);
int originalEndOffset = originalLineOffsetConverter.getOffset(line + 1) - 1;
Region adjustedOffsets = getAdjustedRegion(originalStartOffset, originalEndOffset);
if (adjustedOffsets == null) {
return null;
}
int adjustedStartLine = adjustedLineOffsetConverter.getLine(adjustedOffsets.getStart());
int adjustedEndLine = adjustedLineOffsetConverter.getLine(adjustedOffsets.getEnd());
return new LineBasedRegion(adjustedStartLine, adjustedEndLine);
}
/** Returns the line count of the original text */
@Override
public int getOriginalLineCount() {
return originalLineOffsetConverter.getLineCount();
}
/** Simple token representation used in location adjustment. */
private static class AdjusterToken {
/** Compares by start offset. */
private static final Comparator<AdjusterToken> COMPARE_BY_START_OFFSET = Comparator
.comparingInt(token -> token.startOffset);
/** Compares by end offset. */
private static final Comparator<AdjusterToken> COMPARE_BY_END_OFFSET = Comparator
.comparingInt(token -> token.endOffset);
/** The text content. */
private final String text;
/** The start offset in the text. */
private final int startOffset;
/** The inclusive end offset in the text. */
private final int endOffset;
/** Constructor. */
private AdjusterToken(String text, int startOffset) {
this(text, startOffset, startOffset + text.length() - 1);
}
/** Constructor. */
private AdjusterToken(String text, int startOffset, int endOffset) {
this.text = text;
this.startOffset = startOffset;
this.endOffset = endOffset;
}
/** {@inheritDoc} */
@Override
public boolean equals(Object obj) {
return (obj instanceof AdjusterToken) && ((AdjusterToken) obj).text.equals(text);
}
/** {@inheritDoc} */
@Override
public int hashCode() {
return text.hashCode();
}
}
}