/*
* Copyright (c) CQSE GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.conqat.engine.commons.findings.location;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.function.Supplier;
import org.checkerframework.checker.nullness.qual.Nullable;
import org.conqat.lib.commons.algo.Diff;
import org.conqat.lib.commons.algo.Diff.Delta;
import org.conqat.lib.commons.region.LineBasedRegion;
import org.conqat.lib.commons.region.Region;
import org.conqat.lib.commons.string.LineOffsetConverter;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Suppliers;
/**
* This class is used for adjusting the offsets used in locations (i.e. subclasses of
* {@link ElementLocation}) for text that is slightly modified. The main use-case is the update of
* locations where the local (adjusted) text has different line endings, different content due to
* keyword expansion, or minor local modifications compared to the text on which the analysis was
* executed (the original text).
*
* Both the original and adjusted text may have arbitrary line endings.
*
* The implementation is based on a token diff, which can lead to minor deviations for offsets that
* are not aligned with token boundaries. A character diff would be more precise, but is too
* performance and memory intensive for large files.
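*
* <p>A minimal usage sketch (the texts and offsets below are made-up illustration values, not part
* of this class):
*
* <pre>{@code
* String originalText = "int foo = 1;\nint bar = 2;\n";    // text the analysis ran on
* String adjustedText = "int foo = 1;\r\nint bar = 2;\r\n"; // local text with different line endings
* LocationAdjuster adjuster = new LocationAdjuster(originalText, adjustedText);
* // maps the zero-based, inclusive offsets of "int bar = 2;" to the adjusted text;
* // yields the region [14, 25] here, since the CRLF adds one character before the second line
* Region adjusted = adjuster.getAdjustedRegion(13, 24);
* }</pre>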
*/
public class LocationAdjuster implements ILineAdjuster {
/**
* If the number of tokens in the adjusted region differs from the number of tokens in the original
* region by more than this factor, the mapping is counted as wrong.
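* For example, with a factor of 2 an original region of 10 tokens counts as successfully mapped
* only if at least 5 of its tokens could be mapped to adjusted tokens.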
*/
private static final double LOSS_FACTOR = 2;
/** Maximal number of tokens in the diff we accept. */
private static final int MAX_DIFF_SIZE = 5000;
/** The tokens of the original string. */
private final Supplier<List<AdjusterToken>> lazyOriginalTokens;
/**
* Adjusted tokens corresponding to the {@link #lazyOriginalTokens}. If there is no corresponding
* token, this list contains null at that index. If the content could not be matched/adjusted at all
* (too many differences), this field is null.
*/
private final Supplier<List<AdjusterToken>> lazyMappedAdjustedTokens;
/** Line offset converter for the original text. */
private final LineOffsetConverter originalLineOffsetConverter;
/** Line offset converter for the adjusted text. */
private final LineOffsetConverter adjustedLineOffsetConverter;
/**
* Optional uniform path used to correct element locations in {@link #adjustLocation(ElementLocation)}.
* Null if no correction will be performed.
*/
private final String adjustedUniformPath;
/**
* Constructor.
*
* WARNING: Creating a location adjuster is very expensive and should only be done once per file. In
* case originalText and adjustedText are identical, take a look at {@link SimpleValidLinesFilter}.
*
* @param originalText
* the text for which the input locations have been created, i.e. the text from the
* analysis. May be {@code null}, in which case no adjustment is performed.
* @param adjustedText
* the text for which the locations should be adjusted, i.e. the local text.
* @param adjustedUniformPath
* the adjusted uniform path for adjusted findings.
*/
public LocationAdjuster(@Nullable String originalText, String adjustedText, String adjustedUniformPath) {
this.adjustedUniformPath = adjustedUniformPath;
if (originalText != null) {
originalLineOffsetConverter = new LineOffsetConverter(originalText);
lazyOriginalTokens = Suppliers.memoize(() -> toTokens(originalText));
} else {
originalLineOffsetConverter = new LineOffsetConverter(adjustedText);
lazyOriginalTokens = Suppliers.memoize(() -> toTokens(adjustedText));
}
adjustedLineOffsetConverter = new LineOffsetConverter(adjustedText);
lazyMappedAdjustedTokens = Suppliers
.memoize(() -> calculateMappedAdjustedTokens(adjustedText, lazyOriginalTokens.get()));
}
/**
* Constructor.
*
* @param originalText
* the text for which the input locations have been created, i.e. the text from the
* analysis.
* @param adjustedText
* the text for which the locations should be adjusted, i.e. the local text.
*/
public LocationAdjuster(String originalText, String adjustedText) {
this(originalText, adjustedText, null);
}
/**
* Calculates the mapped adjusted tokens (see {@link #lazyMappedAdjustedTokens}) based on the
* original tokens and the adjusted text. May return null if adjustment is not possible due to too
* many changes.
*/
private static List<AdjusterToken> calculateMappedAdjustedTokens(String adjustedText,
List<AdjusterToken> originalTokens) {
List<AdjusterToken> adjustedTokens = toTokens(adjustedText);
Delta<AdjusterToken> delta = Diff.computeDelta(originalTokens, adjustedTokens, MAX_DIFF_SIZE);
if (delta.getSize() >= MAX_DIFF_SIZE) {
return null;
}
return calculateMappedAdjustedTokensFromDelta(delta, originalTokens, adjustedTokens);
}
/**
* Calculates the mapped adjusted tokens based on the original tokens, the adjusted tokens, and the
* delta between them.
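*
* <p>A worked sketch of the delta walk, under the assumed encoding of
* {@link Delta#getPosition(int)} (positive one-based positions mark additions in the adjusted
* sequence, negative one-based positions mark deletions from the original sequence): for original
* tokens {@code [a, b, c]} and adjusted tokens {@code [a, x, c]}, the delta deletes {@code b} and
* adds {@code x}, so the resulting mapping is {@code [a, null, c]}.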
*/
private static List<AdjusterToken> calculateMappedAdjustedTokensFromDelta(Delta<AdjusterToken> delta,
List<AdjusterToken> originalTokens, List<AdjusterToken> adjustedTokens) {
List<AdjusterToken> mappedAdjustedTokens = new ArrayList<>(Collections.nCopies(originalTokens.size(), null));
int originalIndex = 0;
int adjustedIndex = 0;
for (int i = 0; i < delta.getSize(); ++i) {
int position = delta.getPosition(i);
if (position > 0) {
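// a positive position marks a token that was inserted in the adjusted sequence (one-based index):
// map all tokens before it, then skip the inserted adjusted token without consuming an original token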
position -= 1;
while (adjustedIndex < position) {
mappedAdjustedTokens.set(originalIndex++, adjustedTokens.get(adjustedIndex++));
}
adjustedIndex += 1;
} else {
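// a negative position marks a token that was deleted from the original sequence (one-based index):
// map all tokens before it, then leave the deleted original token unmapped (it stays null)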
position = -position - 1;
while (originalIndex < position) {
mappedAdjustedTokens.set(originalIndex++, adjustedTokens.get(adjustedIndex++));
}
originalIndex += 1;
}
}
while (originalIndex < originalTokens.size()) {
mappedAdjustedTokens.set(originalIndex++, adjustedTokens.get(adjustedIndex++));
}
return mappedAdjustedTokens;
}
/**
* Splits a string into tokens. A token can be:
*
* - A single non-identifier, non-whitespace character
* - A string that consists of Java identifier characters (see
* {@link Character#isJavaIdentifierPart(char)})
*
* This means that {@code abc foo<bar>} will be split into {@code [abc, foo, <, bar, >]}.
*/
@VisibleForTesting
static List<AdjusterToken> toTokens(String s) {
List<AdjusterToken> tokens = new ArrayList<>();
int lastWordSeparator = -1;
for (int i = 0; i < s.length(); i++) {
char currentCharacter = s.charAt(i);
if (!Character.isJavaIdentifierPart(currentCharacter)) {
if (lastWordSeparator != i - 1) {
tokens.add(new AdjusterToken(s.substring(lastWordSeparator + 1, i), lastWordSeparator + 1));
}
lastWordSeparator = i;
if (!Character.isWhitespace(currentCharacter)) {
tokens.add(new AdjusterToken(s.substring(i, i + 1), i));
}
}
}
if (lastWordSeparator <= s.length() - 2) {
tokens.add(new AdjusterToken(s.substring(lastWordSeparator + 1), lastWordSeparator + 1));
}
return tokens;
}
/**
* Maps a zero-based offset range (both inclusive) to the adjusted string. Returns {@code null} if
* the region could not be approximately mapped.
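*
* <p>For example (made-up values): if the original text is {@code "foo bar"} and the adjusted text
* is {@code "foo  bar"} (one extra space), then {@code getAdjustedRegion(4, 6)} for the token
* {@code bar} returns the region {@code [5, 7]}.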
*/
public @Nullable Region getAdjustedRegion(int originalStartOffset, int originalEndOffset) {
List<AdjusterToken> mappedAdjustedTokens = lazyMappedAdjustedTokens.get();
if (mappedAdjustedTokens == null) {
return null;
}
Region originalIndexRegion = findOriginalIndexRegion(originalStartOffset, originalEndOffset);
if (originalIndexRegion.isEmpty()) {
return null;
}
int numOriginalTokens = originalIndexRegion.getLength();
int numAdjustedTokens = 0;
AdjusterToken firstAdjustedToken = null;
AdjusterToken lastAdjustedToken = null;
for (int i = originalIndexRegion.getStart(); i <= originalIndexRegion.getEnd(); ++i) {
AdjusterToken adjustedToken = mappedAdjustedTokens.get(i);
if (adjustedToken != null) {
numAdjustedTokens += 1;
if (firstAdjustedToken == null) {
firstAdjustedToken = adjustedToken;
}
lastAdjustedToken = adjustedToken;
}
}
if (firstAdjustedToken == null || lastAdjustedToken == null
|| LOSS_FACTOR * numAdjustedTokens < numOriginalTokens) {
return null;
}
return new Region(firstAdjustedToken.startOffset, lastAdjustedToken.endOffset);
}
/**
* Returns the region of indices into the {@link #lazyOriginalTokens} that fall within the given
* offsets.
*/
private Region findOriginalIndexRegion(int originalStartOffset, int originalEndOffset) {
List<AdjusterToken> originalTokens = lazyOriginalTokens.get();
AdjusterToken searchToken = new AdjusterToken(null, originalStartOffset, originalEndOffset);
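// the search token is used purely as a key for the offset-based binary searches below; only its
// offsets matter, its text is never accessed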
int originalStartTokenIndex = Collections.binarySearch(originalTokens, searchToken,
AdjusterToken.COMPARE_BY_START_OFFSET);
if (originalStartTokenIndex < 0) {
originalStartTokenIndex = -originalStartTokenIndex - 1;
}
int originalEndTokenIndex = Collections.binarySearch(originalTokens, searchToken,
AdjusterToken.COMPARE_BY_END_OFFSET);
if (originalEndTokenIndex < 0) {
// we want insertion point -1
originalEndTokenIndex = -originalEndTokenIndex - 2;
}
if (originalEndTokenIndex + 1 < originalTokens.size()
&& originalTokens.get(originalEndTokenIndex + 1).startOffset < originalEndOffset) {
originalEndTokenIndex += 1;
}
return new Region(originalStartTokenIndex, originalEndTokenIndex);
}
@Override
public ElementLocation adjustLocation(ElementLocation location) {
if (location instanceof TextRegionLocation) {
return adjustLocation((TextRegionLocation) location);
}
// other locations do not have offsets; if the uniform path should not
// be adjusted, simply return the original location
if (adjustedUniformPath == null) {
return location;
}
if (location instanceof QualifiedNameLocation) {
return new QualifiedNameLocation(((QualifiedNameLocation) location).getQualifiedName(),
adjustedUniformPath);
}
return new ElementLocation(adjustedUniformPath);
}
/**
* Returns a new location with adjusted offsets (if necessary). Returns {@code null} if the location
* cannot be mapped to the adjusted text.
*/
public TextRegionLocation adjustLocation(TextRegionLocation location) {
int startOffset = location.getRawStartOffset();
int endOffset = location.getRawEndOffset();
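// raw offsets may be negative (unset); in that case they are derived from the one-based raw line numbers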
if (startOffset < 0) {
int startLine = location.getRawStartLine();
if (!originalLineOffsetConverter.isValidLine(startLine)) {
return null;
}
startOffset = originalLineOffsetConverter.getOffset(startLine);
}
if (endOffset < 0) {
int endLine = location.getRawEndLine() + 1;
if (!originalLineOffsetConverter.isValidLine(endLine)) {
return null;
}
endOffset = originalLineOffsetConverter.getOffset(endLine) - 1;
}
Region adjustedOffsets = getAdjustedRegion(startOffset, endOffset);
if (adjustedOffsets == null || adjustedOffsets.isEmpty()) {
return null;
}
String uniformPath = location.getUniformPath();
if (adjustedUniformPath != null) {
uniformPath = adjustedUniformPath;
}
int newStartOffset = adjustedOffsets.getStart();
int newEndOffset = adjustedOffsets.getEnd();
return new TextRegionLocation(uniformPath, newStartOffset, newEndOffset,
adjustedLineOffsetConverter.getLine(newStartOffset), adjustedLineOffsetConverter.getLine(newEndOffset));
}
/**
* Adjusts the location of a single line. This only respects the token part of a line, i.e. leading
* and trailing whitespace of a line will be ignored. This method is robust w.r.t. line numbers that
* are out of the range of the original text. In case of such an invalid line, the line is added to
* the given set of invalid lines and {@code null} is returned.
*
* @param line
* the one-based line number to be adjusted.
* @param invalidLines
* used for collecting invalid lines.
*
* @return the one-based lines encoded as a region, as a line may map to multiple lines after the
* change. This may also return {@code null} if no non-empty lines could be found that correspond
* to the input line after adjustment.
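*
* <p>For example (made-up values): if original line 3 was wrapped onto two lines in the adjusted
* text while the preceding lines were left unchanged, {@code adjustLine(3, invalidLines)} returns
* the line-based region {@code [3, 4]}.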
*/
@Override
public LineBasedRegion adjustLine(int line, Set<Integer> invalidLines) {
if (!originalLineOffsetConverter.isValidLine(line) || !originalLineOffsetConverter.isValidLine(line + 1)) {
invalidLines.add(line);
return null;
}
int originalStartOffset = originalLineOffsetConverter.getOffset(line);
int originalEndOffset = originalLineOffsetConverter.getOffset(line + 1) - 1;
Region adjustedOffsets = getAdjustedRegion(originalStartOffset, originalEndOffset);
if (adjustedOffsets == null) {
return null;
}
int adjustedStartLine = adjustedLineOffsetConverter.getLine(adjustedOffsets.getStart());
int adjustedEndLine = adjustedLineOffsetConverter.getLine(adjustedOffsets.getEnd());
return new LineBasedRegion(adjustedStartLine, adjustedEndLine);
}
/** Returns the line count of the original text. */
@Override
public int getOriginalLineCount() {
return originalLineOffsetConverter.getLineCount();
}
/** Simple token representation used in location adjustment. */
static class AdjusterToken {
/** Compares by start offset. */
private static final Comparator<AdjusterToken> COMPARE_BY_START_OFFSET = Comparator
.comparingInt(token -> token.startOffset);
/** Compares by end offset. */
private static final Comparator<AdjusterToken> COMPARE_BY_END_OFFSET = Comparator
.comparingInt(token -> token.endOffset);
/** The text content. */
private final String text;
/** The start offset in the text. */
private final int startOffset;
/** The inclusive end offset in the text. */
private final int endOffset;
/** Constructor. */
private AdjusterToken(String text, int startOffset) {
this(text, startOffset, startOffset + text.length() - 1);
}
/** Constructor. */
private AdjusterToken(String text, int startOffset, int endOffset) {
this.text = text;
this.startOffset = startOffset;
this.endOffset = endOffset;
}
/** {@inheritDoc} Tokens are compared by text only, so the diff matches tokens regardless of their offsets. */
@Override
public boolean equals(Object obj) {
return (obj instanceof AdjusterToken) && ((AdjusterToken) obj).text.equals(text);
}
@Override
public String toString() {
return "AdjusterToken{" + "text='" + text + '\'' + ", startOffset=" + startOffset + ", endOffset="
+ endOffset + '}';
}
/** {@inheritDoc} */
@Override
public int hashCode() {
return text.hashCode();
}
}
}