// JAR search and dependency download from the Maven repository
// com.credibledoc.log.labelizer.hint.SimilarityHint Maven / Gradle / Ivy
package com.credibledoc.log.labelizer.hint;
import com.credibledoc.log.labelizer.exception.LabelizerRuntimeException;
import org.apache.commons.lang3.StringUtils;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Marks columns in a multi-line content. See the
* {@link #linesSimilarityMarker(String)} method description.
*
* @author Kyrylo Semenko
*/
public class SimilarityHint {
private static final char N_NO_REPETITION = 'n';
private static final char W_WITH_REPETITION = 'w';
private SimilarityHint() {
throw new LabelizerRuntimeException("Please do not instantiate the static helper.");
}
/**
* Mark columns with the same characters within column as 'w' ({@link #W_WITH_REPETITION})
* and mark other columns as 'n' ({@link #N_NO_REPETITION}).
* For example if we have three lines:
*
* abd
* dcd
* aef
*
* then the result will be wnw, because first column aba contains repeated char 'a'
* and third column ddf contains repeated char 'd'. Middle (second) column contains dce with
* chars without repetitions hence it marked as 'n' ({@link #N_NO_REPETITION}).
*
* @param inputLines multiple lines separated with Unix \n or Windows \r\n.
* @return String with the same length as inputLines and with 'n' ({@link #N_NO_REPETITION})
* or 'w' ({@link #W_WITH_REPETITION}) markers only. It will help network to understand repeated patterns
* in multiple lines (rows).
*/
public static String linesSimilarityMarker(String inputLines) {
StringBuilder result = new StringBuilder(inputLines.length());
Map> map = new HashMap<>();
List currentRow = new ArrayList<>();
int lineIndex = 0;
map.put(lineIndex, currentRow);
int maxLen = 0;
int index = 0;
for (char character : inputLines.toCharArray()) {
currentRow.add(character);
index++;
if (character == '\n') {
maxLen = Math.max(maxLen, currentRow.size());
if (index < inputLines.length() - 1) {
lineIndex++;
currentRow = new ArrayList<>();
map.put(lineIndex, currentRow);
}
}
}
List columns = new ArrayList<>(lineIndex + 1);
addColumns(map, lineIndex, maxLen, columns);
for (Map.Entry> entry : map.entrySet()) {
labelizeColumns(result, columns, entry);
}
return result.toString();
}
private static void labelizeColumns(StringBuilder result, List columns, Map.Entry> entry) {
List list = entry.getValue();
for (int i = 0; i < list.size(); i++) {
Character character = list.get(i);
if (columns.size() > i) {
String column = columns.get(i);
int count = StringUtils.countMatches(column, character);
if (count > 1) {
result.append(W_WITH_REPETITION);
} else {
result.append(N_NO_REPETITION);
}
} else {
result.append(N_NO_REPETITION);
}
}
}
private static void addColumns(Map> map, int lineIndex, int maxLen, List columns) {
for (int columnIndex = 0; columnIndex < maxLen; columnIndex++) {
StringBuilder stringBuilder = new StringBuilder(lineIndex + 1);
for (Map.Entry> entry : map.entrySet()) {
List list = entry.getValue();
if (list.size() > columnIndex) {
stringBuilder.append(list.get(columnIndex));
}
}
columns.add(stringBuilder.toString());
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy