com.credibledoc.log.labelizer.iterator.CharIterator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of log-labelizer Show documentation
This library is used for finding date-time and thread labels in log lines.
There is a newer version: 1.0.51
package com.credibledoc.log.labelizer.iterator;

import com.credibledoc.log.labelizer.classifier.LinesWithDateClassification;
import com.credibledoc.log.labelizer.date.DateExample;
import com.credibledoc.log.labelizer.date.ProbabilityLabel;
import com.credibledoc.log.labelizer.exception.LabelizerRuntimeException;
import com.credibledoc.log.labelizer.hint.Hint;
import com.credibledoc.log.labelizer.hint.IpGenerator;
import com.credibledoc.log.labelizer.hint.SimilarityHint;
import com.credibledoc.log.labelizer.pagepattern.PagePattern;
import com.credibledoc.log.labelizer.pagepattern.PagePatternRepository;
import com.credibledoc.log.labelizer.training.TrainingDataGenerator;
import com.google.common.primitives.Chars;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.dataset.api.MultiDataSet;
import org.nd4j.linalg.dataset.api.MultiDataSetPreProcessor;
import org.nd4j.linalg.dataset.api.iterator.MultiDataSetIterator;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.io.ClassPathResource;
import org.nd4j.linalg.primitives.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.TimeZone;
import java.util.concurrent.ThreadLocalRandom;
import java.util.function.Consumer;

/**
 * Provides data for training and testing.
 * 
 * @author Kyrylo Semenko
 */
public class CharIterator implements MultiDataSetIterator {
    /** Name of the classpath directory used as a prefix when the {@link #NATIONAL_CHARS_TXT} resource is loaded. */
    public static final String RESOURCES_DIR = "vectors";

    private static final Logger logger = LoggerFactory.getLogger(CharIterator.class);
    /** Message of the exception thrown by operations this iterator does not support. */
    private static final String NOT_IMPLEMENTED = "Not implemented";
    /** Relative path of the file with additional (national) characters that extend the supported character set. */
    public static final String NATIONAL_CHARS_TXT = "chars/nationalChars.txt";
    /** Ordering argument passed to {@code Nd4j.create(...)} when the input and label arrays are allocated. */
    private static final char ORDERING_FORTRAN = 'f';
    /** Log message pattern; the placeholders are the mini-batch index and the logged line. */
    private static final String MINI_BATCH = "MiniBatch: {}: {}";
    /** Number of date examples requested from the generator for a single {@link PagePattern}. */
    private static final int NUM_EXAMPLES_OF_DATE_PATTERN_100 = 100;


    /** Punctuation characters that are part of the supported character set. */
    private static final List PUNCTUATIONS = new ArrayList<>(Arrays.asList('!', '&', '(', ')', '?', '-',
        '\\', ',', '.', '\"', ':', ';', ' '));

    /** Latin letters and digits; combined into the supported character set in the static initializer. */
    private static final List SMALL_LETTERS = listChars('a', 'z');
    private static final List LARGE_LETTERS = listChars('A', 'Z');
    private static final List DIGITS = listChars('0', '9');
    /** Filled in the static initializer: large letters, then small letters, then digits. */
    private static final List DIGITS_AND_LETTERS = new ArrayList<>();
    /** Filled in the static initializer: {@link #DIGITS_AND_LETTERS}, punctuation and the national characters. */
    private static final List DIGITS_AND_LETTERS_AND_PUNCTUATIONS = new ArrayList<>();
    /** Separators placed between a common thread name and the random part of a generated thread name. */
    private static final List SEPARATORS = new ArrayList<>(Arrays.asList("|", " ", "/", ",", "'", "-", "_"));

    /**
     * Two-character strings; the first character is used as a left boundary and the second one as a right boundary
     * of a generated field (date, log level, thread name, IP address).
     */
    private static final List BOUNDARIES = new ArrayList<>(Arrays.asList(
        // some items are repeated, so a random pick selects them with a higher probability
        "  ", "  ", "  ", "  ", "  ",
        "{}", "()", "()", "()",
        "//", ",,", "\"\"", "''", "--", "__", "||",
        "[]", "[]", "[]", "[]", "[]"));

    /** Words that are occasionally attached to a generated thread name. */
    private static final List THREAD_COMMON_NAMES = new ArrayList<>(Arrays.asList("thread", "main", "worker", "job", "pool",
        "local", "exec", "Main"));

    /** Log level names, one of them is randomly chosen for a generated main line. */
    private static final List LOG_LEVELS = new ArrayList<>(Arrays.asList("ALL", "TRACE", "DEBUG", "INFO", "WARN", "WARNING",
        "ERROR", "SEVERE", "FATAL", "CONFIG", "FINE", "FINER", "FINEST", "CRITICAL", "VERBOSE", "Trace", "Debug", "Info", "Warn", "Error"));
    /** Index returned by {@link #convertCharacterToIndex(char)} for a character that is missing in the map. */
    private static final int KEY_FOR_MISSED_CHARS = 0;
    /** Relative path of the file where characters missing in the map are collected. */
    public static final String NEW_CHARS_TXT = "chars/newChars.txt";


    /**
     * Maps each index in the input/output to a character. These characters are used in training and test data.
     * 
     * Key is an order number and value is a character.
     */
    private Map intToCharMap;

    /**
     * The examples of a date string where individual parts are marked with labels.
     * This list is refilled with at most {@link #NUM_EXAMPLES_OF_DATE_PATTERN_100} elements each time it becomes
     * empty, as long as some not trained patterns exist in the database.
     */
    private transient List dateExamples = new ArrayList<>();

    /**
     * Provides characters without labels for filling the gaps between the labeled data.
     */
    private transient LineFiller lineFiller;

    /**
     * Length of each example (number of characters).
     */
    private int exampleLength;

    /**
     * Size of each mini-batch (number of examples).
     */
    private int miniBatchSize;

    /**
     * The {@link PagePattern} that is currently being trained.
     */
    private transient PagePattern lastPagePattern;

    /**
     * How many not trained {@link PagePattern}s the database contained before the training started.
     */
    private long patternsCount;

    /**
     * How many {@link PagePattern}s have been trained.
     */
    private long patternsPassed = 0;

    /**
     * Resource files directory.
     */
    private final String resourcesDirPath;

    // Builds the combined character lists. The insertion order matters: getCharacters() copies
    // DIGITS_AND_LETTERS_AND_PUNCTUATIONS in this order and initCharToIdxMap() uses the position of a character
    // as its index, so the first large letter ends up with the index 0 (see KEY_FOR_MISSED_CHARS).
    static {
        DIGITS_AND_LETTERS.addAll(LARGE_LETTERS);
        DIGITS_AND_LETTERS.addAll(SMALL_LETTERS);
        DIGITS_AND_LETTERS.addAll(DIGITS);
        DIGITS_AND_LETTERS_AND_PUNCTUATIONS.addAll(DIGITS_AND_LETTERS);
        DIGITS_AND_LETTERS_AND_PUNCTUATIONS.addAll(PUNCTUATIONS);
        // Loaded from the classpath; a failure here aborts the class initialization.
        DIGITS_AND_LETTERS_AND_PUNCTUATIONS.addAll(readNationalChars());
    }

    /**
     * Reads the additional (national) characters from the {@link #NATIONAL_CHARS_TXT} resource.
     *
     * @return a new mutable collection with one element for each character of the file content
     * @throws LabelizerRuntimeException if the resource cannot be found or read
     */
    private static Collection<Character> readNationalChars() {
        try {
            String chars = getNationalCharsFromFile();
            return new ArrayList<>(Chars.asList(chars.toCharArray()));
        } catch (Exception e) {
            // Called from the static initializer, hence the checked exception is wrapped.
            throw new LabelizerRuntimeException(e);
        }
    }

    /**
     * Loads the content of the {@link #NATIONAL_CHARS_TXT} file from the {@link #RESOURCES_DIR} classpath directory.
     * The bytes are decoded with the platform default charset.
     *
     * @return the whole file content
     * @throws IOException if the resource cannot be resolved or read
     * @throws LabelizerRuntimeException if the resolved file does not exist
     */
    private static String getNationalCharsFromFile() throws IOException {
        String resourcePath = CharIterator.RESOURCES_DIR + "/" + NATIONAL_CHARS_TXT;
        File nationalCharsFile = new ClassPathResource(resourcePath).getFile();
        if (nationalCharsFile.exists()) {
            byte[] content = Files.readAllBytes(nationalCharsFile.toPath());
            return new String(content);
        }
        throw new LabelizerRuntimeException("File not found: '" + nationalCharsFile.getAbsolutePath() + "'");
    }

    /**
     * Creates the iterator, initializes the character map and reads the number of not trained patterns.
     *
     * @param resourcesDirPath path to the directory with resource files; it must exist
     * @param charset          encoding passed to the {@link LineFiller}
     * @param miniBatchSize    number of examples per mini-batch, must be greater than zero
     * @param exampleLength    number of characters in each input/output vector
     * @throws IOException if the resources directory does not exist
     * @throws IllegalArgumentException if the miniBatchSize is not positive
     */
    public CharIterator(String resourcesDirPath, Charset charset, int miniBatchSize,
                        int exampleLength) throws IOException {
        File resourcesDir = new File(resourcesDirPath);
        if (!resourcesDir.exists()) {
            throw new IOException("Could not access file (does not exist): " + resourcesDirPath);
        }
        if (miniBatchSize < 1) {
            throw new IllegalArgumentException("Invalid miniBatchSize (must be > 0)");
        }
        this.miniBatchSize = miniBatchSize;
        this.exampleLength = exampleLength;
        this.resourcesDirPath = resourcesDirPath;

        // The valid characters are stored in a map for later use in vectorization.
        initCharToIdxMap();

        this.lineFiller = new LineFiller(resourcesDirPath, charset);
        this.patternsCount = PagePatternRepository.getInstance().countNotTrainedPatterns();
    }

    /**
     * Reads all lines of all files placed directly in a sub-folder of the resources directory and checks that
     * every character of these lines is known to this iterator.
     *
     * @param resourcesDirPath path to the resources directory
     * @param textFileEncoding encoding of the files
     * @param folderName       name of the folder inside the resources directory; sub-folders are not scanned
     * @return the lines of all files
     * @throws IOException if a file cannot be read
     * @throws LabelizerRuntimeException if the lines contain a character missing in the character map
     */
    public List<String> readLinesFromFolder(String resourcesDirPath, Charset textFileEncoding, String folderName) throws IOException {
        File dateDir = new File(resourcesDirPath, folderName);
        // 'null' extensions means all files, 'false' means not recursive.
        Collection<File> dateFiles = FileUtils.listFiles(dateDir, null, false);
        List<String> exampleLines = new ArrayList<>();
        for (File file : dateFiles) {
            exampleLines.addAll(Files.readAllLines(file.toPath(), textFileEncoding));
        }
        checkMissingChars(resourcesDirPath, exampleLines);
        return exampleLines;
    }

    /**
     * Verifies that all characters of the lines exist in the {@link #intToCharMap}.
     * When some characters are missing, the content of the {@link #NATIONAL_CHARS_TXT} classpath resource
     * followed by the missing characters is written to the {@link #NATIONAL_CHARS_TXT} file in the
     * resources directory and an exception is thrown.
     *
     * @param resourcesDirPath directory where the extended file is written
     * @param exampleLines     lines to check
     * @throws IOException if the file cannot be read or written
     * @throws LabelizerRuntimeException if at least one character is missing
     */
    private void checkMissingChars(String resourcesDirPath, List<String> exampleLines) throws IOException {
        // Keeps the first-seen order of the missing characters, each of them once.
        StringBuilder missingChars = new StringBuilder();
        for (String line : exampleLines) {
            for (char nextChar : line.toCharArray()) {
                boolean alreadyCollected = missingChars.indexOf(String.valueOf(nextChar)) >= 0;
                if (!alreadyCollected && !intToCharMap.containsValue(nextChar)) {
                    missingChars.append(nextChar);
                }
            }
        }
        if (missingChars.length() == 0) {
            return;
        }
        File charsFile = new File(resourcesDirPath, NATIONAL_CHARS_TXT);
        logger.info("File will be created: '{}'", charsFile.getAbsolutePath());
        File parentDir = charsFile.getParentFile();
        // mkdirs() returns false for an existing directory too, so report a failure only when it is really absent.
        if (!parentDir.isDirectory() && !parentDir.mkdirs()) {
            logger.info("Not created '{}'", parentDir.getAbsolutePath());
        }
        String existingChars = getNationalCharsFromFile();
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(charsFile))) {
            writer.write(existingChars);
            writer.write(missingChars.toString());
        }
        throw new LabelizerRuntimeException("missingChars:" + missingChars);
    }

    /**
     * Creates the {@link #intToCharMap} where the key is the position of a character in the array
     * returned from {@link #getCharacters()}, and logs the map with escaped white characters.
     */
    private void initCharToIdxMap() {
        intToCharMap = new HashMap<>();
        int index = 0;
        for (char character : getCharacters()) {
            intToCharMap.put(index, character);
            index++;
        }
        // Runs of tabs, carriage returns and new lines are printed as a single escape sequence.
        String mapAsText = intToCharMap.toString();
        String withoutTabs = mapAsText.replaceAll("(\\t)+", "\\\\t");
        String withoutCarriageReturns = withoutTabs.replaceAll("(\\r)+", "\\\\r");
        String escaped = withoutCarriageReturns.replaceAll("(\\n)+", "\\\\n");
        logger.info("All used characters in the charToIdxMap: {}", escaped);
    }

    /**
     * Returns all characters supported by the encoding: letters, digits, punctuation and national characters
     * from {@link #DIGITS_AND_LETTERS_AND_PUNCTUATIONS}, followed by the line control characters
     * and by a few additional symbols. The position of a character in the array is used as its index.
     *
     * @return Char array of available characters for the encoding
     */
    private static char[] getCharacters() {
        List<Character> validChars = new ArrayList<>(DIGITS_AND_LETTERS_AND_PUNCTUATIONS);

        validChars.addAll(Arrays.asList('\n', '\t', '\r'));

        validChars.addAll(Arrays.asList('@', '#', '$', '%', '^', '*', '{', '}', '[', ']', '/', '+', '_',
            '\\', '|', '<', '>', '='));

        char[] out = new char[validChars.size()];
        for (int i = 0; i < out.length; i++) {
            out[i] = validChars.get(i);
        }
        return out;
    }

    /**
     * Creates a list of consecutive characters.
     *
     * @param from the first character, inclusive
     * @param to   the last character, inclusive
     * @return a new mutable list with the characters in ascending order; empty when {@code from > to}
     */
    private static List<Character> listChars(char from, char to) {
        List<Character> result = new ArrayList<>(Math.max(0, to - from + 1));
        // An int counter is used because a char counter wraps around to 0 after '\uFFFF' and would never end.
        for (int code = from; code <= to; code++) {
            result.add((char) code);
        }
        return result;
    }

    /**
     * Finds the index of a character in the {@link #intToCharMap}.
     * An unknown character is appended to the {@link #NEW_CHARS_TXT} file in the resources directory
     * (once for each character) and the {@link #KEY_FOR_MISSED_CHARS} index is returned instead.
     *
     * @param character the character to convert
     * @return the index of the character or {@link #KEY_FOR_MISSED_CHARS}
     * @throws LabelizerRuntimeException if the file with new characters cannot be created, read or written
     */
    public int convertCharacterToIndex(char character) {
        for (Map.Entry<Integer, Character> entry : intToCharMap.entrySet()) {
            if (character == entry.getValue()) {
                return entry.getKey();
            }
        }
        File charsFile = new File(resourcesDirPath + "/", NEW_CHARS_TXT);
        if (!charsFile.exists()) {
            try {
                // The file is placed in a sub-directory which may not exist yet.
                File parentDir = charsFile.getParentFile();
                if (parentDir != null) {
                    Files.createDirectories(parentDir.toPath());
                }
                Files.createFile(charsFile.toPath());
            } catch (IOException e) {
                throw new LabelizerRuntimeException(e);
            }
        }
        String chars;
        try {
            chars = new String(Files.readAllBytes(charsFile.toPath()));
        } catch (Exception e) {
            throw new LabelizerRuntimeException(e);
        }
        if (!chars.contains(Character.toString(character))) {
            try (FileWriter fileWriter = new FileWriter(charsFile, true)) {
                String message = "Cannot find index for character: '" + character + "'. It will be write to the file '" +
                    charsFile.getAbsolutePath() + "'. The default character with key '" + KEY_FOR_MISSED_CHARS +
                    "' will be used. Please merge the " +
                    "file content to the '" + NATIONAL_CHARS_TXT + "' file.";
                logger.info(message);
                fileWriter.append(character);
            } catch (Exception e) {
                throw new LabelizerRuntimeException(e);
            }
        }
        return KEY_FOR_MISSED_CHARS;
    }

    /**
     * @return 'true' if some date examples are buffered or some patterns have not been trained yet
     */
    @Override
    public boolean hasNext() {
        return hasMoreExamples();
    }

    /**
     * @return 'true' if the {@link #dateExamples} buffer is not empty or fewer patterns have been passed
     * than were counted at the start
     */
    private boolean hasMoreExamples() {
        if (!dateExamples.isEmpty()) {
            return true;
        }
        return patternsPassed < patternsCount;
    }

    /**
     * Generates the next mini-batch with {@link #miniBatchSize} examples.
     *
     * @return the generated data
     * @throws NoSuchElementException if no more examples exist or the generation fails; the original
     * exception is attached as the cause
     */
    @Override
    public MultiDataSet next() {
        try {
            return next(miniBatchSize);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
            NoSuchElementException exception = new NoSuchElementException(e.getMessage());
            // The constructor with a cause is not available in older Java versions, so initCause is used.
            exception.initCause(e);
            throw exception;
        }
    }

    /**
     * Generates up to 'miniBatches' pairs of example and labels lines and encodes them as one-hot arrays.
     *
     * @param miniBatches the maximal number of examples in the result
     * @return a data set with two feature arrays (characters and similarity hints) and one array with labels
     * @throws NoSuchElementException if no more examples exist
     * @throws LabelizerRuntimeException if an example line and its labels line have different lengths
     */
    @Override
    public MultiDataSet next(int miniBatches) {
        if (!hasMoreExamples()) {
            throw new NoSuchElementException();
        }
        List<Pair<String, String>> examples = new ArrayList<>(miniBatches);
        // Create pairs where first row is examples and second row is labels, for example
        // 1) 28.2.2019 11:45:00.123 1234567654 abcde 28.2.2018 11:46:00.124...
        // 2) ddddddddddddddddddddddwwwwwwwwwwwwwwwwwwdddddddddddddddddddddd...
        // Every pair has the same length and they equal with exampleLength
        for (int i = 0; i < miniBatches && hasMoreExamples(); i++) {
            StringBuilder left = new StringBuilder(exampleLength);
            StringBuilder right = new StringBuilder(exampleLength);

            int randomIndex = randomFromZeroToMaxInclusive(LinesWithDateClassification.NUM_SUB_LINES.size() - 1);
            int numSubLines = LinesWithDateClassification.NUM_SUB_LINES.get(randomIndex);
            prepareDataForLearning(left, right, numSubLines);

            examples.add(new Pair<>(left.toString(), right.toString()));
        }

        int currMinibatchSize = Math.min(miniBatches, examples.size());
        //Allocate space:
        //Note the order here:
        // dimension 0 = number of examples in minibatch
        // dimension 1 = size of each vector (i.e., number of characters or number of labels)
        // dimension 2 = length of each time series/example
        // Why 'f' order here? See https://jrmerwin.github.io/deeplearning4j-docs/usingrnns.html,
        // section "Alternative: Implementing a custom DataSetIterator"
        INDArray input = Nd4j.create(new int[]{currMinibatchSize, intToCharMap.size(), exampleLength}, ORDERING_FORTRAN); // NOSONAR
        INDArray inputHint = Nd4j.create(new int[]{currMinibatchSize, 2, exampleLength}, ORDERING_FORTRAN); // NOSONAR
        INDArray labels = Nd4j.create(new int[]{currMinibatchSize, ProbabilityLabel.values().length, exampleLength}, ORDERING_FORTRAN); // NOSONAR

        for (int miniBatchIndex = 0; miniBatchIndex < currMinibatchSize; miniBatchIndex++) {
            Pair<String, String> example = examples.get(miniBatchIndex);
            String stringLine = example.getLeft();

            String hintLine = SimilarityHint.linesSimilarityMarker(stringLine);

            String labelsLine = example.getRight();

            if (stringLine.length() != labelsLine.length()) {
                String lines =
                    "\nLength: '" + stringLine.length() + "', line  : '" + stringLine + "'" +
                        "\nLength: '" + labelsLine.length() + "', labels: '" + labelsLine + "'";
                throw new LabelizerRuntimeException("Line and labeled line should have same lengths.\n" + lines);
            }
            logger.info(MINI_BATCH, miniBatchIndex, stringLine);
            logger.info(MINI_BATCH, miniBatchIndex, labelsLine);
            logger.info(MINI_BATCH, miniBatchIndex, hintLine);
            logger.info("");
            for (int charIndex = 0; charIndex < stringLine.length(); charIndex++) {
                char exampleChar = stringLine.charAt(charIndex);
                char labelChar = labelsLine.charAt(charIndex);
                char hintChar = hintLine.charAt(charIndex);
                int exampleCharIndex = convertCharacterToIndex(exampleChar);
                // The hint has two classes only: 'n' is encoded in the row 0, anything else in the row 1.
                int hintCharIndex = hintChar == 'n' ? 0 : 1;
                int labelCharIndex = ProbabilityLabel.findIndex(labelChar);
                input.putScalar(new int[]{miniBatchIndex, exampleCharIndex, charIndex}, 1.0);
                inputHint.putScalar(new int[]{miniBatchIndex, hintCharIndex, charIndex}, 1.0);
                labels.putScalar(new int[]{miniBatchIndex, labelCharIndex, charIndex}, 1.0);
            }
        }

        return new org.nd4j.linalg.dataset.MultiDataSet(
            new INDArray[]{input, inputHint},
            new INDArray[]{labels}
        );
    }

    /**
     * Not supported by this iterator.
     *
     * @param preProcessor ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void setPreProcessor(MultiDataSetPreProcessor preProcessor) {
        final String reason = NOT_IMPLEMENTED;
        throw new UnsupportedOperationException(reason);
    }

    /**
     * Fills the buffers with data. These data will be used for training.
     * <p>
     * StringLine contains several sub-lines divided with line separators.
     * <p>
     * A sub-line can be a main line or an additional line. The main line contains date, thread name and log level,
     * for example
     * <pre>
     *     01.11.2019;00:00:01.095 DEBUG [DefaultQuartzScheduler_Worker-9] [  /   ] - Main... (main line)
     * </pre>
     * The additional line follows the main line, for example
     * <pre>
     *     01.11.2019;12:52:09.721 ERROR [DefaultQuartzScheduler_Worker-3] [  /   ] - Error in... (main line)
     *        at com.bla.security(Class.java:65) (additional line)
     * </pre>
     * Examples of sub-line combinations:
     * <pre>
     *     main line
     *     additional line
     *     main line
     * </pre>
     * <pre>
     *     main line
     *     additional line
     *     additional line
     * </pre>
     * <pre>
     *     main line
     *     main line
     *     additional line
     * </pre>
     * <pre>
     *     additional line
     *     main line
     *     additional line
     * </pre>
     * Lines are divided with line separators, mixed Windows and Unix randomly.
     * The last line contains the separator as well.
     * <p>
     * An IP address can be placed in both the main line and the additional line.
     * <p>
     * NOTE(review): the helper methods return the pair passed to them when it is not 'null', and they return
     * an empty pair for a sub-line that is not a main line. So when the first sub-line is an additional line,
     * the empty thread name, date boundaries, text before the date and log level boundaries appear to be reused
     * for all following sub-lines of this example. Confirm that this is intended.
     *
     * @param stringLine an empty buffer
     * @param labelsLine an empty buffer
     * @param numSubLines how many sub-lines the {@link LinesWithDateClassification#EXAMPLE_LENGTH_120} will be
     *                    divided to.
     */
    @SuppressWarnings("unchecked")
    private void prepareDataForLearning(StringBuilder stringLine, StringBuilder labelsLine, int numSubLines) {
        // Decide for each sub-line whether it is a main line; 9 of 10 random values mean a main line.
        Map isMainLine = new HashMap<>(numSubLines);
        for (int i = 0; i < numSubLines; i++) {
            boolean isMain = randomFromZeroToMaxInclusive(9) > 0;
            isMainLine.put(i, isMain);
        }
        if (LinesWithDateClassification.EXAMPLE_LENGTH_120 % numSubLines != 0) {
            throw new LabelizerRuntimeException("Constant EXAMPLE_LENGTH must be divided by " + numSubLines +
                " without remaining.");
        }
        int subLineLen = LinesWithDateClassification.EXAMPLE_LENGTH_120 / numSubLines;
        // Layout decisions shared by all sub-lines of this example.
        boolean dateIsFirst = randomFromZeroToMaxInclusive(9) > 0;
        boolean containsThreadName = randomFromZeroToMaxInclusive(9) > 0;
        boolean containsLogLevel = randomFromZeroToMaxInclusive(19) > 0;
        boolean threadNameBeforeLogLevel = randomFromZeroToMaxInclusive(1) == 0;

        DateExample dateExample = nextDateExample();
        // These pairs are created for the first sub-line and then reused, see the helper methods.
        Pair beforeDate = null;
        Pair threadName = null;
        Pair dateLeftBoundary = null;
        Pair dateRightBoundary = null;
        Pair logLevelLeftBoundary = null;
        Pair logLevelRightBoundary = null;
        String dateBoundary = BOUNDARIES.get(randomFromZeroToMaxInclusive(BOUNDARIES.size() - 1));
        String levelBoundary = BOUNDARIES.get(randomFromZeroToMaxInclusive(BOUNDARIES.size() - 1));
        for (int subLineNum = 0; subLineNum < numSubLines; subLineNum++) {
            StringBuilder line = new StringBuilder(subLineLen);
            StringBuilder labels = new StringBuilder(subLineLen);

            // Windows or Unix line separator, labeled as 'without date'.
            String newLine = randomFromZeroToMaxInclusive(1) == 0 ? "\r\n" : "\n";
            String newLineLabels = StringUtils.leftPad("", newLine.length(),
                ProbabilityLabel.N_WITHOUT_DATE.getCharacter());
            Pair newLinePair = new Pair<>(newLine, newLineLabels);
            // 'remaining' tracks how many characters of the sub-line are still free for other fields.
            int remaining = subLineLen - newLine.length();

            boolean isMain = isMainLine.get(subLineNum);

            Pair date = nextDate(dateExample, isMain);
            remaining = remaining - date.getLeft().length();

            dateLeftBoundary = dateLeftBoundary(dateLeftBoundary, isMain, dateIsFirst, dateBoundary.substring(0, 1));
            remaining = remaining - dateLeftBoundary.getLeft().length();

            dateRightBoundary = dateRightBoundary(dateRightBoundary, isMain, dateBoundary.substring(1));
            remaining = remaining - dateRightBoundary.getLeft().length();

            beforeDate = getBeforeDate(beforeDate, dateIsFirst, isMain, remaining);
            remaining = remaining - beforeDate.getLeft().length();

            threadName = getThreadName(threadName, containsThreadName, isMain, remaining);
            // An additional line never contains the thread name.
            Pair threadNameCurrent = isMain ? threadName : new Pair<>("", "");
            remaining = remaining - threadNameCurrent.getLeft().length();

            Pair logLevel = getLogLevel(isMain, containsLogLevel, remaining);
            remaining = remaining - logLevel.getLeft().length();

            logLevelLeftBoundary = logLevelBoundary(logLevelLeftBoundary, isMain, containsLogLevel, remaining,
                levelBoundary.substring(0, 1));
            remaining = remaining - logLevelLeftBoundary.getLeft().length();

            logLevelRightBoundary = logLevelBoundary(logLevelRightBoundary, isMain, containsLogLevel, remaining,
                levelBoundary.substring(1));
            remaining = remaining - logLevelRightBoundary.getLeft().length();

            // Log level together with its boundaries is handled as a single field.
            StringBuilder logLevelFinalString = new StringBuilder();
            StringBuilder logLevelFinalLabels = new StringBuilder();
            appendAll(logLevelFinalString, logLevelFinalLabels, logLevelLeftBoundary, logLevel, logLevelRightBoundary);
            Pair logLevelFinal = new Pair<>(logLevelFinalString.toString(), logLevelFinalLabels.toString());

            // One of five random values means that the sub-line contains an IP address.
            boolean generateIpAddress = randomFromZeroToMaxInclusive(4) == 0;
            Pair ipAddress = getIpAddress(generateIpAddress, remaining);
            remaining = remaining - ipAddress.getLeft().length();

            appendAll(line, labels, beforeDate, dateLeftBoundary, date, dateRightBoundary);

            List> subFields;

            if (threadNameBeforeLogLevel) {
                subFields = Arrays.asList(threadNameCurrent, logLevelFinal, ipAddress);
            } else {
                subFields = Arrays.asList(logLevelFinal, threadNameCurrent, ipAddress);
            }

            // The free space is divided equally between the fillers placed before each field.
            int fillersLen = remaining / subFields.size();
            if (fillersLen < 0) {
                fillersLen = 0;
            }

            applyFilling(line, labels, remaining, subFields, fillersLen);

            // Cut the overflow or fill the rest, so the sub-line with its separator has exactly subLineLen chars.
            int lenWithNewLine = line.length() + newLine.length();
            if (lenWithNewLine > subLineLen) {
                line.setLength(subLineLen - newLine.length());
                labels.setLength(subLineLen - newLine.length());
            } else {
                Pair lastFiller = lineFiller.generateFiller(subLineLen - lenWithNewLine);
                appendAll(line, labels, lastFiller);
            }
            appendAll(line, labels, newLinePair);

            stringLine.append(line);
            labelsLine.append(labels);
        }

        validateLength(stringLine, labelsLine);
    }

    @SuppressWarnings("unchecked")
    private void applyFilling(StringBuilder line, StringBuilder labels, int remaining, List> subFields, int fillersLen) {
        for (Pair pair : subFields) {
            Pair filler = lineFiller.generateFiller(fillersLen);
            remaining = remaining - filler.getFirst().length();

            appendAll(line, labels, filler, pair);
        }
    }

    /**
     * Checks that both buffers have exactly {@link LinesWithDateClassification#EXAMPLE_LENGTH_120} characters.
     *
     * @param stringLine the generated example; it is checked first
     * @param labelsLine the generated labels
     * @throws LabelizerRuntimeException if a buffer has another length
     */
    private void validateLength(StringBuilder stringLine, StringBuilder labelsLine) {
        final int expected = LinesWithDateClassification.EXAMPLE_LENGTH_120;

        final int stringLineLength = stringLine.length();
        if (stringLineLength != expected) {
            String message = "StringLine length " + stringLineLength +
                " not equals with " + expected + ", " +
                "stringLine: " + stringLine;
            throw new LabelizerRuntimeException(message);
        }

        final int labelsLineLength = labelsLine.length();
        if (labelsLineLength != expected) {
            String message = "LabelsLine length " + labelsLineLength +
                " not equals with " + expected + ", " +
                "labelsLine: " + labelsLine;
            throw new LabelizerRuntimeException(message);
        }
    }

    /**
     * Provides the boundary placed after the date.
     *
     * @param dateRightBoundary the pair from a previous sub-line; returned unchanged when it is not 'null'
     * @param isMain            'false' produces an empty pair
     * @param boundary          the boundary text
     * @return the boundary with labels of the same length, or an empty pair
     */
    private Pair<String, String> dateRightBoundary(Pair<String, String> dateRightBoundary, boolean isMain,
                                                   String boundary) {
        if (dateRightBoundary != null) {
            return dateRightBoundary;
        }
        if (!isMain) {
            return new Pair<>("", "");
        }
        String labels = StringUtils.leftPad("", boundary.length(), ProbabilityLabel.N_WITHOUT_DATE.getString());
        return new Pair<>(boundary, labels);
    }

    /**
     * Provides the boundary placed before the date.
     *
     * @param dateLeftBoundary the pair from a previous sub-line; returned unchanged when it is not 'null'
     * @param isMain           'false' produces an empty pair
     * @param dateIsFirst      'true' produces an empty pair, because nothing precedes the date
     * @param boundary         the boundary text
     * @return the boundary with labels of the same length, or an empty pair
     */
    private Pair<String, String> dateLeftBoundary(Pair<String, String> dateLeftBoundary, boolean isMain,
                                                  boolean dateIsFirst, String boundary) {
        if (dateLeftBoundary != null) {
            return dateLeftBoundary;
        }
        if (!isMain || dateIsFirst) {
            return new Pair<>("", "");
        }
        String labels = StringUtils.leftPad("", boundary.length(), ProbabilityLabel.N_WITHOUT_DATE.getString());
        return new Pair<>(boundary, labels);
    }

    /**
     * Appends the keys of all pairs to the first buffer and the values to the second buffer.
     *
     * @param line   receives the keys (example text)
     * @param labels receives the values (labels)
     * @param pairs  the pairs to append in the given order
     */
    @SuppressWarnings("unchecked")
    private void appendAll(StringBuilder line, StringBuilder labels, Pair<String, String>... pairs) {
        for (Pair<String, String> pair : pairs) {
            line.append(pair.getKey());
            labels.append(pair.getValue());
        }
    }

    /**
     * Generates a random IP address surrounded with randomly chosen boundaries.
     *
     * @param generateIpAddress 'false' produces an empty pair
     * @param remaining         the number of free characters; an empty pair is returned when the address
     *                          with boundaries does not fit
     * @return the address with its labels, or an empty pair
     */
    private Pair<String, String> getIpAddress(boolean generateIpAddress, int remaining) {
        if (!generateIpAddress) {
            return new Pair<>("", "");
        }
        String boundaries = BOUNDARIES.get(randomFromZeroToMaxInclusive(BOUNDARIES.size() - 1));
        String ip = IpGenerator.randomIp();
        if (boundaries.length() + ip.length() > remaining) {
            return new Pair<>("", "");
        }
        String result = boundaries.substring(0, 1) + ip + boundaries.substring(1);
        // Boundaries are labeled as 'without date', the address itself as an IP address.
        String labels = ProbabilityLabel.N_WITHOUT_DATE.getString() +
            StringUtils.leftPad("", ip.length(), ProbabilityLabel.I_IP_ADDRESS_AND_PORT.getCharacter()) +
            ProbabilityLabel.N_WITHOUT_DATE.getString();
        return new Pair<>(result, labels);
    }

    /**
     * Chooses a random log level from {@link #LOG_LEVELS}.
     *
     * @param isMain           'false' produces an empty pair
     * @param containsLogLevel 'false' produces an empty pair
     * @param remaining        the number of free characters; an empty pair is returned when the level does not fit
     * @return the log level with its labels, or an empty pair
     */
    private Pair<String, String> getLogLevel(boolean isMain, boolean containsLogLevel, int remaining) {
        if (!isMain || !containsLogLevel) {
            return new Pair<>("", "");
        }
        String level = LOG_LEVELS.get(randomFromZeroToMaxInclusive(LOG_LEVELS.size() - 1));
        if (remaining < level.length()) {
            return new Pair<>("", "");
        }
        String labels = StringUtils.leftPad("", level.length(), ProbabilityLabel.L_LOG_LEVEL.getString());
        return new Pair<>(level, labels);
    }

    /**
     * Provides a boundary of the log level; used for both the left and the right boundary.
     *
     * @param logLevelLeftBoundary the pair from a previous sub-line; returned unchanged when it is not 'null'
     * @param isMain               'false' produces an empty pair
     * @param containsLogLevel     'false' produces an empty pair
     * @param remaining            the number of free characters; an empty pair is returned when the boundary
     *                             does not fit
     * @param boundary             the boundary text
     * @return the boundary with labels of the same length, or an empty pair
     */
    private Pair<String, String> logLevelBoundary(Pair<String, String> logLevelLeftBoundary, boolean isMain,
                                                  boolean containsLogLevel, int remaining, String boundary) {
        if (logLevelLeftBoundary != null) {
            return logLevelLeftBoundary;
        }
        if (!isMain || !containsLogLevel || remaining < boundary.length()) {
            return new Pair<>("", "");
        }
        String labels = StringUtils.leftPad("", boundary.length(), ProbabilityLabel.N_WITHOUT_DATE.getString());
        return new Pair<>(boundary, labels);
    }

    /**
     * Generates a thread name surrounded with randomly chosen boundaries. The name consists of a random
     * filler and, in one of ten cases, of a common thread name joined with a separator.
     *
     * @param threadName         the pair from a previous sub-line; returned unchanged when it is not 'null'
     * @param containsThreadName 'false' produces an empty pair
     * @param isMain             'false' produces an empty pair
     * @param remaining          the number of free characters, limits the length of the random part
     * @return the thread name with its labels, or an empty pair
     */
    private Pair<String, String> getThreadName(Pair<String, String> threadName, boolean containsThreadName, boolean isMain, int remaining) {
        if (threadName != null) {
            return threadName;
        }
        if (!containsThreadName || !isMain) {
            return new Pair<>("", "");
        }
        boolean useThreadSubstring = randomFromZeroToMaxInclusive(9) == 9;
        String threadSubstring = "";
        // Decides whether the common name precedes or follows the random part.
        boolean before = randomFromZeroToMaxInclusive(1) == 1;
        if (useThreadSubstring) {
            String separator = SEPARATORS.get(randomFromZeroToMaxInclusive(SEPARATORS.size() - 1));
            threadSubstring = THREAD_COMMON_NAMES.get(randomFromZeroToMaxInclusive(THREAD_COMMON_NAMES.size() - 1));
            if (before) {
                threadSubstring = threadSubstring + separator;
            } else {
                threadSubstring = separator + threadSubstring;
            }
        }
        int randomLen = getRandom(3, 15);
        int length = Math.min(remaining - threadSubstring.length(), randomLen + threadSubstring.length());
        if (length < 1) {
            return new Pair<>("", "");
        }
        String filler = lineFiller.generateFiller(length).getKey();
        String threadNameWithoutBoundaries;
        if (before) {
            threadNameWithoutBoundaries = threadSubstring + filler;
        } else {
            threadNameWithoutBoundaries = filler + threadSubstring;
        }

        String boundary = BOUNDARIES.get(randomFromZeroToMaxInclusive(BOUNDARIES.size() - 1));
        String threadNameFinal = boundary.substring(0, 1) + threadNameWithoutBoundaries + boundary.substring(1);
        // Boundaries are labeled as 'without date', the name itself as a thread.
        String labels = ProbabilityLabel.N_WITHOUT_DATE.getString() +
            StringUtils.leftPad("", threadNameWithoutBoundaries.length(), ProbabilityLabel.T_THREAD.getCharacter()) +
            ProbabilityLabel.N_WITHOUT_DATE.getString();
        return new Pair<>(threadNameFinal, labels);
    }

    /**
     * Generates a random filler placed before the date.
     *
     * @param beforeDate  the pair from a previous sub-line; returned unchanged when it is not 'null'
     * @param dateIsFirst 'true' produces an empty pair
     * @param isMain      'false' produces an empty pair
     * @param remaining   the number of free characters; the filler has from 0 to min(19, remaining - 1) characters
     * @return the filler with its labels, or an empty pair
     */
    private Pair<String, String> getBeforeDate(Pair<String, String> beforeDate, boolean dateIsFirst, boolean isMain, int remaining) {
        if (beforeDate != null) {
            return beforeDate;
        }
        if (!isMain || dateIsFirst) {
            return new Pair<>("", "");
        }
        // Keeps the upper bound of the random length non-negative.
        int available = Math.max(remaining, 1);
        int fillersLen = randomFromZeroToMaxInclusive(Math.min(19, available - 1));
        return lineFiller.generateFiller(fillersLen);
    }

    /**
     * Returns the current date string with labels from the example and moves the example forward
     * by a random time from 0 to 1000 milliseconds, so the next invocation returns a later date.
     *
     * @param dateExample the source of the date; its date, source and labels are updated
     * @param isMain      'false' produces an empty pair and leaves the example untouched
     * @return the date string with its labels as they were before this invocation, or an empty pair
     */
    private Pair<String, String> nextDate(DateExample dateExample, boolean isMain) {
        if (!isMain) {
            return new Pair<>("", "");
        }
        Pair<String, String> result = new Pair<>(dateExample.getSource(), dateExample.getLabels());

        // prepare the data for next invocation
        Date date = dateExample.getDate();
        String pattern = dateExample.getPattern();
        Locale locale = dateExample.getLocale();
        TimeZone timeZone = dateExample.getTimeZone();
        SimpleDateFormat simpleDateFormat = new SimpleDateFormat(pattern, locale);
        simpleDateFormat.setTimeZone(timeZone);
        int oneSecond = 1000;
        int randomAddition = randomFromZeroToMaxInclusive(oneSecond);
        date.setTime(date.getTime() + randomAddition);
        String dateString = simpleDateFormat.format(date);
        GregorianCalendar gregorianCalendar = new GregorianCalendar();
        gregorianCalendar.setTime(date);
        String labels = TrainingDataGenerator.findLabels(dateString, pattern, locale, gregorianCalendar, timeZone);
        dateExample.setDate(date);
        dateExample.setSource(dateString);
        dateExample.setLabels(labels);

        return result;
    }

    /**
     * Removes and returns the first item of the {@link #dateExamples}.
     * <p>
     * When the {@link #dateExamples} is empty, the next pattern is obtained from the
     * {@link PagePatternRepository#getNotTrainedPattern()} method, the previous {@link #lastPagePattern}
     * (if any) is marked as trained and saved, the {@link #patternsPassed} is recalculated and the list is
     * refilled with dates generated for the obtained pattern. If the list is still empty after that,
     * the whole procedure is repeated by a recursive call.
     *
     * @return The first {@link DateExample} taken out of the {@link #dateExamples}.
     */
    private DateExample nextDateExample() {
        if (!dateExamples.isEmpty()) {
            return dateExamples.remove(0);
        }

        PagePatternRepository repository = PagePatternRepository.getInstance();
        // The next pattern is requested before the previous one is marked as trained.
        PagePattern nextPattern = repository.getNotTrainedPattern();
        if (lastPagePattern != null) {
            lastPagePattern.setTrained(true);
            repository.save(lastPagePattern);
            patternsPassed = patternsCount - repository.countNotTrainedPatterns();
        }
        lastPagePattern = nextPattern;
        dateExamples.addAll(TrainingDataGenerator.generateDates(nextPattern, NUM_EXAMPLES_OF_DATE_PATTERN_100));

        if (!dateExamples.isEmpty()) {
            return dateExamples.remove(0);
        }
        logger.info("List of dateExamples is empty. Last PagePattern: {}", lastPagePattern);
        return nextDateExample();
    }

    /**
     * Reports how many entries the {@link #intToCharMap} contains.
     * NOTE(review): the method name suggests the value is used as the number of input columns
     * of the network - confirm against the callers.
     *
     * @return Size of {@link #intToCharMap}.
     */
    public int inputColumns() {
        return intToCharMap.size();
    }

    /**
     * Reports how many constants the {@link ProbabilityLabel} enum declares.
     * NOTE(review): the method name suggests the value is used as the number of output classes
     * of the network - confirm against the callers.
     *
     * @return Number of {@link ProbabilityLabel}s.
     */
    public int totalOutcomes() {
        return ProbabilityLabel.values().length;
    }

    /**
     * Prepares the iterator for a new pass: resets the trained state in the {@link PagePatternRepository},
     * sets the {@link #patternsPassed} to zero, reloads the {@link #patternsCount} from the number of not
     * trained patterns and forgets the {@link #lastPagePattern}.
     * NOTE(review): the dateExamples list is not cleared here - confirm it is intended.
     */
    public void reset() {
        PagePatternRepository repository = PagePatternRepository.getInstance();
        repository.resetTrained();
        patternsPassed = 0;
        patternsCount = repository.countNotTrainedPatterns();
        lastPagePattern = null;
    }

    /**
     * Tells whether the {@link #reset()} method can be used with this iterator.
     *
     * @return always 'true'
     */
    public boolean resetSupported() {
        return true;
    }

    /**
     * Tells whether this iterator supports the asynchronous mode.
     * NOTE(review): the meaning of the asynchronous mode is defined by the overridden method - see its documentation.
     *
     * @return always 'true'
     */
    @Override
    public boolean asyncSupported() {
        return true;
    }

    /**
     * Pre-processors are not supported by this iterator.
     *
     * @return nothing, the method never returns normally
     * @throws UnsupportedOperationException always, with the {@link #NOT_IMPLEMENTED} message
     */
    @Override
    public MultiDataSetPreProcessor getPreProcessor() {
        throw new UnsupportedOperationException(NOT_IMPLEMENTED);
    }

    /**
     * Removing of items is not supported by this iterator.
     *
     * @throws UnsupportedOperationException always, with the {@link #NOT_IMPLEMENTED} message, the same way
     * as the other not supported operations of this class
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException(NOT_IMPLEMENTED);
    }

    /**
     * Bulk processing of the remaining items is not supported by this iterator.
     *
     * @param action ignored
     * @throws UnsupportedOperationException always, with the {@link #NOT_IMPLEMENTED} message
     */
    @Override
    public void forEachRemaining(Consumer action) {
        throw new UnsupportedOperationException(NOT_IMPLEMENTED);
    }

    /**
     * Generates a random number from 0 to max, both inclusive.
     *
     * @param max the highest number that can be returned; zero and negative values yield 0
     * @return a random number between 0 and max (inclusive), or 0 when max is not positive
     */
    private static int randomFromZeroToMaxInclusive(int max) {
        if (max <= 0) {
            return 0;
        }
        // The exclusive bound is calculated as long. An int (max + 1) overflows for Integer.MAX_VALUE
        // to a negative number, which resulted in a constant 0.
        long boundExclusive = (long) max + 1L;
        // The value is lower than the bound, hence it always fits into int.
        return (int) ThreadLocalRandom.current().nextLong(0L, boundExclusive);
    }

    /**
     * Generates a random number from the [minInclusive, maxExclusive) range.
     *
     * @param minInclusive the lowest number that can be returned
     * @param maxExclusive the upper bound, never returned unless the range is empty
     * @return a random number from the range, or minInclusive when the range is empty
     * (minInclusive is greater than or equal to maxExclusive)
     */
    private static int getRandom(int minInclusive, int maxExclusive) {
        boolean rangeIsEmpty = minInclusive >= maxExclusive;
        return rangeIsEmpty
            ? minInclusive
            : ThreadLocalRandom.current().nextInt(minInclusive, maxExclusive);
    }

    /**
     * Parse an input line and mark some digits as {@link ProbabilityLabel#Y_YEAR} (y)
     * and others as {@link ProbabilityLabel#N_WITHOUT_DATE} (n).
     * <p>
     * Digits are considered as a year (y) when they are a part of a year
     * from 1980 to current year + 1 (next year). The exact ranges are defined
     * by the {@link Hint} constants used in the {@link #isDate(Integer)} method.
     * <p>
     * Year can be formatted as 4 digits, for example 2019 or two digits, for example 19.
     * Every sequence of four or two successive digits is tested, a not-digit character
     * starts the sequences from the beginning.
     *
     * @param line input with (or without) date pattern(s), for example abc 2019.05.10 def
     * @return labels of the same length as the line, for example nnnnyyyynyynyynnnn
     */
    public static String yearHintLenient(String line) {
        final int length = line.length();
        StringBuilder labels = new StringBuilder(length);
        StringBuilder lastFourDigits = new StringBuilder(4);
        StringBuilder lastTwoDigits = new StringBuilder(2);
        for (int index = 0; index < length; index++) {
            char current = line.charAt(index);
            // Every character is labeled as 'n' first, the label can be overwritten when a year is found.
            labels.append(ProbabilityLabel.N_WITHOUT_DATE.getCharacter());
            if (!Character.isDigit(current)) {
                lastFourDigits.setLength(0);
                lastTwoDigits.setLength(0);
                continue;
            }
            processDigit(labels, lastFourDigits, lastTwoDigits, current);
        }
        return labels.toString();
    }

    /**
     * Adds the digit to both sliding windows and marks the last four and / or the last two labels
     * in the result as a year when the content of the corresponding window is recognized as a year.
     *
     * @param result    labels generated so far, the label of the current digit is already appended
     * @param context4  window with up to four last successive digits
     * @param context2  window with up to two last successive digits
     * @param character the current digit
     */
    private static void processDigit(StringBuilder result, StringBuilder context4, StringBuilder context2,
                                     char character) {
        context4.append(character);
        context2.append(character);

        markWindowIfYear(result, context4, 4);
        markWindowIfYear(result, context2, 2);
    }

    /**
     * When the window is full, tests its content, overwrites the last 'width' labels in case of a year
     * and drops the oldest digit of the window to make room for the next one.
     */
    private static void markWindowIfYear(StringBuilder result, StringBuilder window, int width) {
        if (window.length() != width) {
            return;
        }
        int candidate = Integer.parseInt(window.toString());
        if (isDate(candidate)) {
            writeToResult(result, width);
        }
        window.deleteCharAt(0);
    }

    /**
     * Overwrites the last numToAppend characters of the result
     * with the {@link ProbabilityLabel#Y_YEAR} character.
     *
     * @param result      the labels to be changed
     * @param numToAppend how many characters at the end should be replaced
     */
    private static void writeToResult(StringBuilder result, int numToAppend) {
        String yearLabels = StringUtils.rightPad("", numToAppend, ProbabilityLabel.Y_YEAR.getCharacter());
        int keptLength = result.length() - numToAppend;
        result.setLength(keptLength);
        result.append(yearLabels);
    }

    /**
     * Decides if the number can be a year. The number is accepted when it belongs to one of three ranges:
     * <ul>
     *     <li>from {@link Hint#OLDEST_YEAR} to {@link Hint#ACTUAL_YEAR} + 1, both inclusive</li>
     *     <li>from {@link Hint#SHORT_ZERO_YEAR} to {@link Hint#SHORT_ACTUAL_YEAR} + 1, both inclusive</li>
     *     <li>from {@link Hint#SHORT_OLD_YEAR} inclusive to {@link Hint#SHORT_HELPFULL_YEAR} exclusive</li>
     * </ul>
     *
     * @param contextResult the number parsed from four or two successive digits
     * @return 'true' when the number belongs to at least one of the ranges
     */
    private static boolean isDate(Integer contextResult) {
        final int candidate = contextResult;
        if (candidate >= Hint.OLDEST_YEAR && candidate <= Hint.ACTUAL_YEAR + 1) {
            return true;
        }
        if (candidate >= Hint.SHORT_ZERO_YEAR && candidate <= Hint.SHORT_ACTUAL_YEAR + 1) {
            return true;
        }
        return candidate >= Hint.SHORT_OLD_YEAR && candidate < Hint.SHORT_HELPFULL_YEAR;
    }

    /**
     * Compares two strings of labels position by position.
     *
     * @param recognizedOutput the labels to be evaluated
     * @param expectedOutput   the correct labels
     * @return the number of positions where both strings contain the same character
     * @throws LabelizerRuntimeException when the lengths of the strings differ
     */
    public static int countOfSuccessfullyMarkedChars(String recognizedOutput, String expectedOutput) {
        final int length = recognizedOutput.length();
        if (length != expectedOutput.length()) {
            throw new LabelizerRuntimeException("RecognizedOutput and expectedOutput length are not equal. " +
                "recognizedOutput: '" + recognizedOutput +
                "', expectedOutput: '" + expectedOutput +
                "'");
        }
        int matches = 0;
        int position = 0;
        while (position < length) {
            boolean same = recognizedOutput.charAt(position) == expectedOutput.charAt(position);
            if (same) {
                matches++;
            }
            position++;
        }
        return matches;
    }

    /**
     * How many training lines the {@link CharIterator} contains. The value is calculated
     * as {@link #NUM_EXAMPLES_OF_DATE_PATTERN_100} multiplied by {@link #patternsCount}.
     *
     * @return The number of lines
     */
    public long trainingDataSetSize() {
        return NUM_EXAMPLES_OF_DATE_PATTERN_100 * patternsCount;
    }

    /**
     * How many training lines remain in the {@link CharIterator}. The value is calculated
     * as {@link #trainingDataSetSize()} minus the lines already passed, where the passed lines are
     * {@link #patternsPassed} multiplied by {@link #NUM_EXAMPLES_OF_DATE_PATTERN_100}.
     *
     * @return The number of remaining lines
     */
    public long getRemainingDataSetSize() {
        long allLines = trainingDataSetSize();
        long passedLines = patternsPassed * NUM_EXAMPLES_OF_DATE_PATTERN_100;
        return allLines - passedLines;
    }

    /**
     * Returns 'true' if the {@link #dateExamples} is empty. It means all the examples have been consumed
     * and the next {@link PagePattern} can be loaded from the
     * {@link PagePatternRepository#getNotTrainedPattern()} method, if such a pattern exists.
     *
     * @return 'true' if no examples of the current pattern remain
     */
    public boolean isPatternTrained() {
        return dateExamples.isEmpty();
    }
}