All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.credibledoc.log.labelizer.hint.SimilarityHint Maven / Gradle / Ivy

package com.credibledoc.log.labelizer.hint;

import com.credibledoc.log.labelizer.exception.LabelizerRuntimeException;
import org.apache.commons.lang3.StringUtils;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Marks columns in multi-line content. See the
 * {@link #linesSimilarityMarker(String)} method description.
 * 
 * @author Kyrylo Semenko
 */
public class SimilarityHint {
    private static final char N_NO_REPETITION = 'n';
    private static final char W_WITH_REPETITION = 'w';
    
    private SimilarityHint() {
        throw new LabelizerRuntimeException("Please do not instantiate the static helper.");
    }

    /**
     * Mark columns with the same characters within column as 'w' ({@link #W_WITH_REPETITION})
     * and mark other columns as 'n' ({@link #N_NO_REPETITION}).
     * For example if we have three lines:
     * 
     *     abd
     *     dcd
     *     aef
     * 
* then the result will be wnw, because first column aba contains repeated char 'a' * and third column ddf contains repeated char 'd'. Middle (second) column contains dce with * chars without repetitions hence it marked as 'n' ({@link #N_NO_REPETITION}). * * @param inputLines multiple lines separated with Unix \n or Windows \r\n. * @return String with the same length as inputLines and with 'n' ({@link #N_NO_REPETITION}) * or 'w' ({@link #W_WITH_REPETITION}) markers only. It will help network to understand repeated patterns * in multiple lines (rows). */ public static String linesSimilarityMarker(String inputLines) { StringBuilder result = new StringBuilder(inputLines.length()); Map> map = new HashMap<>(); List currentRow = new ArrayList<>(); int lineIndex = 0; map.put(lineIndex, currentRow); int maxLen = 0; int index = 0; for (char character : inputLines.toCharArray()) { currentRow.add(character); index++; if (character == '\n') { maxLen = Math.max(maxLen, currentRow.size()); if (index < inputLines.length() - 1) { lineIndex++; currentRow = new ArrayList<>(); map.put(lineIndex, currentRow); } } } List columns = new ArrayList<>(lineIndex + 1); addColumns(map, lineIndex, maxLen, columns); for (Map.Entry> entry : map.entrySet()) { labelizeColumns(result, columns, entry); } return result.toString(); } private static void labelizeColumns(StringBuilder result, List columns, Map.Entry> entry) { List list = entry.getValue(); for (int i = 0; i < list.size(); i++) { Character character = list.get(i); if (columns.size() > i) { String column = columns.get(i); int count = StringUtils.countMatches(column, character); if (count > 1) { result.append(W_WITH_REPETITION); } else { result.append(N_NO_REPETITION); } } else { result.append(N_NO_REPETITION); } } } private static void addColumns(Map> map, int lineIndex, int maxLen, List columns) { for (int columnIndex = 0; columnIndex < maxLen; columnIndex++) { StringBuilder stringBuilder = new StringBuilder(lineIndex + 1); for (Map.Entry> 
entry : map.entrySet()) { List list = entry.getValue(); if (list.size() > columnIndex) { stringBuilder.append(list.get(columnIndex)); } } columns.add(stringBuilder.toString()); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy