// org.conqat.engine.commons.findings.location.LocationAdjuster Maven / Gradle / Ivy
/*
* Copyright (c) CQSE GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.conqat.engine.commons.findings.location;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.function.Supplier;

import org.checkerframework.checker.nullness.qual.Nullable;
import org.conqat.lib.commons.algo.Diff;
import org.conqat.lib.commons.algo.Diff.Delta;
import org.conqat.lib.commons.region.LineBasedRegion;
import org.conqat.lib.commons.region.Region;
import org.conqat.lib.commons.string.LineOffsetConverter;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Suppliers;
/**
* This class is used for adjusting the offsets used in locations (i.e. subclasses of
* {@link ElementLocation}) for text that is slightly modified. The main use-case is the update of
* locations where the local (adjusted) text has different line endings, different content due to
* keyword expansion, or minor local modifications compared to the text on which the analysis was
* executed (original text).
*
* Both the original and adjusted text may have arbitrary line endings.
*
* The implementation is based on a token diff, which can lead to minor deviations for offsets that
* are not aligned with token boundaries. A character diff would be more precise, but is too
* performance and memory intensive for large files.
*/
public class LocationAdjuster implements ILineAdjuster {
/**
* If the number of tokens in the adjusted region differs by the tokens in the original region by
* more than this factor, the mapping is counted as wrong.
*/
private static final double LOSS_FACTOR = 2;
/** Maximal number of tokens in the diff we accept. */
private static final int MAX_DIFF_SIZE = 5000;
/** The tokens of the original string. */
private final Supplier> lazyOriginalTokens;
/**
* Adjusted tokens corresponding to the {@link #lazyOriginalTokens}. If there is no corresponding
* token, this list contains null at the index. If the content could not be matched/adjusted at all
* (too many differences), this field is null.
*/
private final Supplier> lazyMappedAdjustedTokens;
/** Line offset converted for the original text. */
private final LineOffsetConverter originalLineOffsetConverter;
/** Line offset converted for the adjusted text. */
private final LineOffsetConverter adjustedLineOffsetConverter;
/**
* Optional uniform path to correct element locations of {@link #adjustLocation(ElementLocation)}.
* Null if no correction will be performed.
*/
private final String adjustedUniformPath;
/**
* Constructor.
*
* WARNING: Creating a location adjuster is very expensive and should only be done once per file. In
* case originalText and adjustedText is identical take a look at {@link SimpleValidLinesFilter}.
*
* @param originalText
* the text for which the input locations have been created, i.e. the text from the
* analysis. May be {@code null}, in which case no adjustment is performed.
* @param adjustedText
* the text for which the locations should be adjusted, i.e. the local text.
* @param adjustedUniformPath
* the adjusted uniform path for adjusted findings.
*/
public LocationAdjuster(@Nullable String originalText, String adjustedText, String adjustedUniformPath) {
this.adjustedUniformPath = adjustedUniformPath;
if (originalText != null) {
originalLineOffsetConverter = new LineOffsetConverter(originalText);
lazyOriginalTokens = Suppliers.memoize(() -> toTokens(originalText));
} else {
originalLineOffsetConverter = new LineOffsetConverter(adjustedText);
lazyOriginalTokens = Suppliers.memoize(() -> toTokens(adjustedText));
}
adjustedLineOffsetConverter = new LineOffsetConverter(adjustedText);
lazyMappedAdjustedTokens = Suppliers
.memoize(() -> calculateMappedAdjustedTokens(adjustedText, lazyOriginalTokens.get()));
}
/**
 * Creates an adjuster that leaves the uniform path of locations untouched; delegates to the
 * three-argument constructor with a {@code null} adjusted uniform path.
 *
 * @param originalText
 *            the text from the analysis, i.e. the text the input locations refer to.
 * @param adjustedText
 *            the local text, i.e. the text the locations should be adjusted to.
 */
public LocationAdjuster(String originalText, String adjustedText) {
    this(originalText, adjustedText, null);
}
/**
 * Calculates the content of {@link #lazyMappedAdjustedTokens} based on the original tokens and the
 * adjusted text.
 *
 * @param adjustedText
 *            the adjusted text; it is tokenized with {@link #toTokens(String)}.
 * @param originalTokens
 *            the tokens of the original text.
 * @return the mapped adjusted tokens, or {@code null} if adjustment is not possible due to too many
 *         changes (the delta size reaches {@link #MAX_DIFF_SIZE}).
 */
private static List<AdjusterToken> calculateMappedAdjustedTokens(String adjustedText,
        List<AdjusterToken> originalTokens) {
    List<AdjusterToken> adjustedTokens = toTokens(adjustedText);
    Delta<AdjusterToken> delta = Diff.computeDelta(originalTokens, adjustedTokens, MAX_DIFF_SIZE);
    if (delta.getSize() >= MAX_DIFF_SIZE) {
        // the texts differ too much for a meaningful mapping
        return null;
    }
    return calculateMappedAdjustedTokensFromDelta(delta, originalTokens, adjustedTokens);
}
/**
 * Calculates the content of {@link #lazyMappedAdjustedTokens} based on original tokens, adjusted
 * tokens, and the delta between them.
 *
 * @param delta
 *            the delta between both token lists.
 * @param originalTokens
 *            the tokens of the original text.
 * @param adjustedTokens
 *            the tokens of the adjusted text.
 * @return a list with one entry per original token, holding the adjusted token it is mapped to, or
 *         {@code null} for original tokens that are skipped by a delta entry.
 */
private static List<AdjusterToken> calculateMappedAdjustedTokensFromDelta(Delta<AdjusterToken> delta,
        List<AdjusterToken> originalTokens, List<AdjusterToken> adjustedTokens) {
    // start with "no counterpart" for every original token
    List<AdjusterToken> mappedAdjustedTokens = new ArrayList<>(
            Collections.nCopies(originalTokens.size(), null));
    int originalIndex = 0;
    int adjustedIndex = 0;
    for (int i = 0; i < delta.getSize(); ++i) {
        int position = delta.getPosition(i);
        if (position > 0) {
            // positive entry: (position - 1) is an index into the adjusted tokens
            position -= 1;
            // pair up all tokens in front of that index
            while (adjustedIndex < position) {
                mappedAdjustedTokens.set(originalIndex++, adjustedTokens.get(adjustedIndex++));
            }
            // the adjusted token at that index has no original counterpart, so skip it
            adjustedIndex += 1;
        } else {
            // non-positive entry: (-position - 1) is an index into the original tokens
            position = -position - 1;
            // pair up all tokens in front of that index
            while (originalIndex < position) {
                mappedAdjustedTokens.set(originalIndex++, adjustedTokens.get(adjustedIndex++));
            }
            // the original token at that index keeps its null mapping
            originalIndex += 1;
        }
    }
    // everything after the last delta entry is paired up one-to-one
    while (originalIndex < originalTokens.size()) {
        mappedAdjustedTokens.set(originalIndex++, adjustedTokens.get(adjustedIndex++));
    }
    return mappedAdjustedTokens;
}
/**
 * Splits a string into tokens. A token can be:
 * <ul>
 * <li>A single character that is neither a java identifier character nor whitespace</li>
 * <li>A string that consists of consecutive java identifier characters (see
 * {@link Character#isJavaIdentifierPart(char)})</li>
 * </ul>
 * Whitespace separates tokens but does not produce a token itself. This means that
 * {@code abc foo<bar>} will be split to {@code [abc, foo, <, bar, >]}.
 *
 * @param s
 *            the string to split.
 * @return the tokens in order of appearance, each carrying its zero-based start offset in
 *         {@code s}.
 */
@VisibleForTesting
static List<AdjusterToken> toTokens(String s) {
    List<AdjusterToken> tokens = new ArrayList<>();
    // index of the most recent non-identifier character; -1 while none has been seen
    int lastWordSeparator = -1;
    for (int i = 0; i < s.length(); i++) {
        char currentCharacter = s.charAt(i);
        if (!Character.isJavaIdentifierPart(currentCharacter)) {
            // a separator ends the identifier run that started after the previous separator
            if (lastWordSeparator != i - 1) {
                tokens.add(new AdjusterToken(s.substring(lastWordSeparator + 1, i), lastWordSeparator + 1));
            }
            lastWordSeparator = i;
            // non-whitespace separators are tokens of their own
            if (!Character.isWhitespace(currentCharacter)) {
                tokens.add(new AdjusterToken(s.substring(i, i + 1), i));
            }
        }
    }
    // trailing identifier run that is not terminated by a separator
    if (lastWordSeparator <= s.length() - 2) {
        tokens.add(new AdjusterToken(s.substring(lastWordSeparator + 1), lastWordSeparator + 1));
    }
    return tokens;
}
/**
 * Maps a zero-based offset range (both inclusive) of the original text to the adjusted string.
 *
 * @param originalStartOffset
 *            the zero-based inclusive start offset in the original text.
 * @param originalEndOffset
 *            the zero-based inclusive end offset in the original text.
 * @return the region of offsets in the adjusted text, spanning from the first to the last mapped
 *         token. Returns {@code null} if the region could not be approximately mapped, i.e. if no
 *         token mapping is available, the range covers no original token, or too few of the
 *         covered tokens have a counterpart in the adjusted text (see {@link #LOSS_FACTOR}).
 */
public @Nullable Region getAdjustedRegion(int originalStartOffset, int originalEndOffset) {
    List<AdjusterToken> mappedAdjustedTokens = lazyMappedAdjustedTokens.get();
    if (mappedAdjustedTokens == null) {
        return null;
    }
    Region originalIndexRegion = findOriginalIndexRegion(originalStartOffset, originalEndOffset);
    if (originalIndexRegion.isEmpty()) {
        return null;
    }
    int numOriginalTokens = originalIndexRegion.getLength();
    int numAdjustedTokens = 0;
    AdjusterToken firstAdjustedToken = null;
    AdjusterToken lastAdjustedToken = null;
    // collect the first and last mapped token and count how many tokens survived
    for (int i = originalIndexRegion.getStart(); i <= originalIndexRegion.getEnd(); ++i) {
        AdjusterToken adjustedToken = mappedAdjustedTokens.get(i);
        if (adjustedToken != null) {
            numAdjustedTokens += 1;
            if (firstAdjustedToken == null) {
                firstAdjustedToken = adjustedToken;
            }
            lastAdjustedToken = adjustedToken;
        }
    }
    // reject if nothing was mapped or too many of the original tokens were lost
    if (firstAdjustedToken == null || lastAdjustedToken == null
            || LOSS_FACTOR * numAdjustedTokens < numOriginalTokens) {
        return null;
    }
    return new Region(firstAdjustedToken.startOffset, lastAdjustedToken.endOffset);
}
/**
 * Returns the region of indexes in the {@link #lazyOriginalTokens} contained in the given offsets.
 * The returned region is empty if the end index is smaller than the start index.
 *
 * @param originalStartOffset
 *            the zero-based inclusive start offset in the original text.
 * @param originalEndOffset
 *            the zero-based inclusive end offset in the original text.
 */
private Region findOriginalIndexRegion(int originalStartOffset, int originalEndOffset) {
    List<AdjusterToken> originalTokens = lazyOriginalTokens.get();
    // probe that only carries offsets; the comparators used below never look at the text
    AdjusterToken searchToken = new AdjusterToken(null, originalStartOffset, originalEndOffset);

    int originalStartTokenIndex = Collections.binarySearch(originalTokens, searchToken,
            AdjusterToken.COMPARE_BY_START_OFFSET);
    if (originalStartTokenIndex < 0) {
        // no exact hit: use the insertion point, i.e. the first token starting after the offset
        originalStartTokenIndex = -originalStartTokenIndex - 1;
    }

    int originalEndTokenIndex = Collections.binarySearch(originalTokens, searchToken,
            AdjusterToken.COMPARE_BY_END_OFFSET);
    if (originalEndTokenIndex < 0) {
        // we want insertion point -1
        originalEndTokenIndex = -originalEndTokenIndex - 2;
    }

    // also include the following token if it starts before the end offset
    if (originalEndTokenIndex + 1 < originalTokens.size()
            && originalTokens.get(originalEndTokenIndex + 1).startOffset < originalEndOffset) {
        originalEndTokenIndex += 1;
    }
    return new Region(originalStartTokenIndex, originalEndTokenIndex);
}
/**
 * Adjusts an arbitrary element location. Text region locations get their offsets adjusted; all
 * other locations only have their uniform path replaced, provided an adjusted uniform path was
 * configured.
 */
@Override
public ElementLocation adjustLocation(ElementLocation location) {
    if (location instanceof TextRegionLocation) {
        TextRegionLocation textRegionLocation = (TextRegionLocation) location;
        return adjustLocation(textRegionLocation);
    }

    // Remaining location types carry no offsets. Without an adjusted uniform
    // path there is nothing to change, so the input is handed back as is.
    if (adjustedUniformPath == null) {
        return location;
    }

    if (!(location instanceof QualifiedNameLocation)) {
        return new ElementLocation(adjustedUniformPath);
    }
    QualifiedNameLocation qualifiedNameLocation = (QualifiedNameLocation) location;
    return new QualifiedNameLocation(qualifiedNameLocation.getQualifiedName(), adjustedUniformPath);
}
/**
 * Creates a new location whose offsets and lines refer to the adjusted text. Negative raw offsets
 * of the input are replaced by offsets derived from its raw start/end lines in the original text.
 * The adjusted uniform path is used if one was configured, otherwise the path of the input.
 *
 * @return the adjusted location, or {@code null} if a required line is not valid in the original
 *         text or the location cannot be mapped to the adjusted text.
 */
public TextRegionLocation adjustLocation(TextRegionLocation location) {
    int originalStart = location.getRawStartOffset();
    int originalEnd = location.getRawEndOffset();

    if (originalStart < 0) {
        // no start offset available: fall back to the offset of the start line
        int firstLine = location.getRawStartLine();
        if (!originalLineOffsetConverter.isValidLine(firstLine)) {
            return null;
        }
        originalStart = originalLineOffsetConverter.getOffset(firstLine);
    }

    if (originalEnd < 0) {
        // no end offset available: take the offset right before the line following the end line
        int lineAfterEnd = location.getRawEndLine() + 1;
        if (!originalLineOffsetConverter.isValidLine(lineAfterEnd)) {
            return null;
        }
        originalEnd = originalLineOffsetConverter.getOffset(lineAfterEnd) - 1;
    }

    Region mappedOffsets = getAdjustedRegion(originalStart, originalEnd);
    if (mappedOffsets == null || mappedOffsets.isEmpty()) {
        return null;
    }

    String locationPath = location.getUniformPath();
    String resultPath = adjustedUniformPath == null ? locationPath : adjustedUniformPath;

    int mappedStart = mappedOffsets.getStart();
    int mappedEnd = mappedOffsets.getEnd();
    int mappedStartLine = adjustedLineOffsetConverter.getLine(mappedStart);
    int mappedEndLine = adjustedLineOffsetConverter.getLine(mappedEnd);
    return new TextRegionLocation(resultPath, mappedStart, mappedEnd, mappedStartLine, mappedEndLine);
}
/**
 * Adjusts the location of a single line. This only respects the token part of a line, i.e. leading
 * and trailing whitespace of a line will be ignored. This method is robust w.r.t. line numbers that
 * are out of the range of the original text: if the line or its successor is not a valid line of
 * the original text, the line is added to {@code invalidLines} and {@code null} is returned.
 *
 * @param line
 *            the one-based line number to be adjusted.
 * @param invalidLines
 *            used for collecting invalid lines.
 *
 * @return the one-based lines encoded as a region, as a line may map to multiple lines after
 *         changing. This may also return null, if no non-empty lines could be found that correspond
 *         to the input line after adjustment.
 */
@Override
public LineBasedRegion adjustLine(int line, Set<Integer> invalidLines) {
    if (!originalLineOffsetConverter.isValidLine(line) || !originalLineOffsetConverter.isValidLine(line + 1)) {
        invalidLines.add(line);
        return null;
    }
    // the line spans from its own offset up to the offset right before the next line
    int originalStartOffset = originalLineOffsetConverter.getOffset(line);
    int originalEndOffset = originalLineOffsetConverter.getOffset(line + 1) - 1;
    Region adjustedOffsets = getAdjustedRegion(originalStartOffset, originalEndOffset);
    if (adjustedOffsets == null) {
        return null;
    }
    int adjustedStartLine = adjustedLineOffsetConverter.getLine(adjustedOffsets.getStart());
    int adjustedEndLine = adjustedLineOffsetConverter.getLine(adjustedOffsets.getEnd());
    return new LineBasedRegion(adjustedStartLine, adjustedEndLine);
}
/** Returns the number of lines of the original text, as reported by its line offset converter. */
@Override
public int getOriginalLineCount() {
    int originalLineCount = originalLineOffsetConverter.getLineCount();
    return originalLineCount;
}
/** Simple token representation used in location adjustment. */
static class AdjusterToken {
/** Compares by start offset. */
private static final Comparator COMPARE_BY_START_OFFSET = Comparator
.comparingInt(token -> token.startOffset);
/** Compares by end offset. */
private static final Comparator COMPARE_BY_END_OFFSET = Comparator
.comparingInt(token -> token.endOffset);
/** The text content. */
private final String text;
/** The start offset in the text. */
private final int startOffset;
/** The inclusive end offset in the text. */
private final int endOffset;
/** Constructor. */
private AdjusterToken(String text, int startOffset) {
this(text, startOffset, startOffset + text.length() - 1);
}
/** Constructor. */
private AdjusterToken(String text, int startOffset, int endOffset) {
this.text = text;
this.startOffset = startOffset;
this.endOffset = endOffset;
}
/** {@inheritDoc} */
@Override
public boolean equals(Object obj) {
return (obj instanceof AdjusterToken) && ((AdjusterToken) obj).text.equals(text);
}
@Override
public String toString() {
return "AdjusterToken{" + "text='" + text + '\'' + ", startOffset=" + startOffset + ", endOffset="
+ endOffset + '}';
}
/** {@inheritDoc} */
@Override
public int hashCode() {
return text.hashCode();
}
}
}