All downloads are free. The search and download functionality uses the official Maven repository.

com.credibledoc.log.labelizer.iterator.CharIterator Maven / Gradle / Ivy

There is a newer version: 1.0.51
Show newest version
package com.credibledoc.log.labelizer.iterator;

import com.credibledoc.log.labelizer.classifier.LinesWithDateClassification;
import com.credibledoc.log.labelizer.date.DateExample;
import com.credibledoc.log.labelizer.date.ProbabilityLabel;
import com.credibledoc.log.labelizer.exception.LabelizerRuntimeException;
import com.credibledoc.log.labelizer.hint.Hint;
import com.credibledoc.log.labelizer.hint.IpGenerator;
import com.credibledoc.log.labelizer.hint.SimilarityHint;
import com.credibledoc.log.labelizer.pagepattern.PagePattern;
import com.credibledoc.log.labelizer.pagepattern.PagePatternRepository;
import com.credibledoc.log.labelizer.training.TrainingDataGenerator;
import com.google.common.primitives.Chars;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.dataset.api.MultiDataSet;
import org.nd4j.linalg.dataset.api.MultiDataSetPreProcessor;
import org.nd4j.linalg.dataset.api.iterator.MultiDataSetIterator;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.io.ClassPathResource;
import org.nd4j.linalg.primitives.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.TimeZone;
import java.util.concurrent.ThreadLocalRandom;
import java.util.function.Consumer;

/**
 * Provides data for training and testing.
 * 
 * @author Kyrylo Semenko
 */
public class CharIterator implements MultiDataSetIterator {
    public static final String RESOURCES_DIR = "vectors";

    private static final Logger logger = LoggerFactory.getLogger(CharIterator.class);
    private static final String NOT_IMPLEMENTED = "Not implemented";
    public static final String NATIONAL_CHARS_TXT = "chars/nationalChars.txt";
    private static final char ORDERING_FORTRAN = 'f';
    private static final String MINI_BATCH = "MiniBatch: {}: {}";
    private static final int NUM_EXAMPLES_OF_DATE_PATTERN_100 = 100;


    private static final List PUNCTUATIONS = new ArrayList<>(Arrays.asList('!', '&', '(', ')', '?', '-',
        '\\', ',', '.', '\"', ':', ';', ' '));

    private static final List SMALL_LETTERS = listChars('a', 'z');
    private static final List LARGE_LETTERS = listChars('A', 'Z');
    private static final List DIGITS = listChars('0', '9');
    private static final List DIGITS_AND_LETTERS = new ArrayList<>();
    private static final List DIGITS_AND_LETTERS_AND_PUNCTUATIONS = new ArrayList<>();
    private static final List SEPARATORS = new ArrayList<>(Arrays.asList("|", " ", "/", ",", "'", "-", "_"));

    private static final List BOUNDARIES = new ArrayList<>(Arrays.asList(
        // repeated for more probability
        "  ", "  ", "  ", "  ", "  ",
        "{}", "()", "()", "()",
        "//", ",,", "\"\"", "''", "--", "__", "||",
        "[]", "[]", "[]", "[]", "[]"));

    private static final List THREAD_COMMON_NAMES = new ArrayList<>(Arrays.asList("thread", "main", "worker", "job", "pool",
        "local", "exec", "Main"));

    private static final List LOG_LEVELS = new ArrayList<>(Arrays.asList("ALL", "TRACE", "DEBUG", "INFO", "WARN", "WARNING",
        "ERROR", "SEVERE", "FATAL", "CONFIG", "FINE", "FINER", "FINEST", "CRITICAL", "VERBOSE", "Trace", "Debug", "Info", "Warn", "Error"));
    private static final int KEY_FOR_MISSED_CHARS = 0;
    public static final String NEW_CHARS_TXT = "chars/newChars.txt";


    /**
     * Maps each character to an index in the input/output. These characters is used in train and test data.
     * 

* Key is an order number and value is a character. */ private Map intToCharMap; /** * The examples of a date string where individual parts are marked with labels. * This list contains max {@link #NUM_EXAMPLES_OF_DATE_PATTERN_100} elements and the new elements where appends * continuously until some patterns exists in the database. */ private transient List dateExamples = new ArrayList<>(); /** * Provides characters without labels for filling out of gaps between the labeled data. */ private transient LineFiller lineFiller; /** * Length of each example/minibatch (number of characters) */ private int exampleLength; /** * Size of each minibatch (number of examples) */ private int miniBatchSize; /** * The current trained {@link PagePattern}. */ private transient PagePattern lastPagePattern; /** * How many {@link PagePattern}s for a training the database contains before start of the training. */ private long patternsCount; /** * How many {@link PagePattern}s has been trained. */ private long patternsPassed = 0; /** * Resource files directory. 
*/ private final String resourcesDirPath; static { DIGITS_AND_LETTERS.addAll(LARGE_LETTERS); DIGITS_AND_LETTERS.addAll(SMALL_LETTERS); DIGITS_AND_LETTERS.addAll(DIGITS); DIGITS_AND_LETTERS_AND_PUNCTUATIONS.addAll(DIGITS_AND_LETTERS); DIGITS_AND_LETTERS_AND_PUNCTUATIONS.addAll(PUNCTUATIONS); DIGITS_AND_LETTERS_AND_PUNCTUATIONS.addAll(readNationalChars()); } private static Collection readNationalChars() { try { String chars = getNationalCharsFromFile(); return new ArrayList<>(Chars.asList(chars.toCharArray())); } catch (Exception e) { throw new LabelizerRuntimeException(e); } } private static String getNationalCharsFromFile() throws IOException { ClassPathResource resource = new ClassPathResource(CharIterator.RESOURCES_DIR + "/" + NATIONAL_CHARS_TXT); File file = resource.getFile(); if (!file.exists()) { throw new LabelizerRuntimeException("File not found: '" + file.getAbsolutePath() + "'"); } return new String(Files.readAllBytes(file.toPath())); } /** * @param resourcesDirPath Path to text file to use for generating samples * @param charset Encoding of the text file(s). 
Can try Charset.defaultCharset() * @param miniBatchSize Number of examples per mini-batch * @param exampleLength Number of characters in each input/output vector * @throws IOException If text file cannot be loaded */ public CharIterator(String resourcesDirPath, Charset charset, int miniBatchSize, int exampleLength) throws IOException { if (!new File(resourcesDirPath).exists()) { throw new IOException("Could not access file (does not exist): " + resourcesDirPath); } if (miniBatchSize <= 0) { throw new IllegalArgumentException("Invalid miniBatchSize (must be > 0)"); } this.resourcesDirPath = resourcesDirPath; this.exampleLength = exampleLength; this.miniBatchSize = miniBatchSize; //Store valid characters is a map for later use in vectorization initCharToIdxMap(); lineFiller = new LineFiller(resourcesDirPath, charset); patternsCount = PagePatternRepository.getInstance().countNotTrainedPatterns(); } public List readLinesFromFolder(String resourcesDirPath, Charset textFileEncoding, String folderName) throws IOException { File dateDir = new File(resourcesDirPath, folderName); Collection dateFiles = FileUtils.listFiles(dateDir, null, false); List exampleLines = new ArrayList<>(); for (File file : dateFiles) { exampleLines.addAll(Files.readAllLines(file.toPath(), textFileEncoding)); } checkMissingChars(resourcesDirPath, exampleLines); return exampleLines; } private void checkMissingChars(String resourcesDirPath, List exampleLines) throws IOException { List missingChars = new ArrayList<>(); for (String line : exampleLines) { char[] thisLine = line.toCharArray(); for (char nextChar : thisLine) { if (!intToCharMap.containsValue(nextChar) && !missingChars.contains(nextChar)) { missingChars.add(nextChar); } } } if (!missingChars.isEmpty()) { StringBuilder stringBuilder = new StringBuilder(); for (Character next : missingChars) { stringBuilder.append(next); } File charsFile = new File(resourcesDirPath, NATIONAL_CHARS_TXT); logger.info("File will be created: '{}'", 
charsFile.getAbsolutePath()); if (!charsFile.getParentFile().mkdirs()) { logger.info("Not created '{}'", charsFile.getParentFile().getAbsolutePath()); } String existingChars = getNationalCharsFromFile(); try (BufferedWriter writer = new BufferedWriter(new FileWriter(charsFile))) { writer.write(existingChars); writer.write(stringBuilder.toString()); } throw new LabelizerRuntimeException("missingChars:" + stringBuilder.toString()); } } private void initCharToIdxMap() { intToCharMap = new HashMap<>(); char[] chars = getCharacters(); for (int i = 0; i < chars.length; i++) { intToCharMap.put(i, chars[i]); } String escaped = intToCharMap.toString() .replaceAll("(\\t)+", "\\\\t") .replaceAll("(\\r)+", "\\\\r") .replaceAll("(\\n)+", "\\\\n"); logger.info("All used characters in the charToIdxMap: {}", escaped); } /** * A minimal character set, with a-z, A-Z, 0-9 and common punctuation etc. * As per getMinimalCharacterSet(), but with a few extra characters. * @return Char array of available characters for the encoding */ private static char[] getCharacters() { List validChars = new ArrayList<>(DIGITS_AND_LETTERS_AND_PUNCTUATIONS); Character[] temp = {'\n', '\t', '\r'}; validChars.addAll(Arrays.asList(temp)); Character[] additionalChars = {'@', '#', '$', '%', '^', '*', '{', '}', '[', ']', '/', '+', '_', '\\', '|', '<', '>', '='}; validChars.addAll(Arrays.asList(additionalChars)); char[] out = new char[validChars.size()]; int i = 0; for (Character c : validChars) out[i++] = c; return out; } private static List listChars(char from, char to) { List result = new ArrayList<>(); for (char c = from; c <= to; c++) { result.add(c); } return result; } public int convertCharacterToIndex(char character) { for (Map.Entry entry : intToCharMap.entrySet()) { if (character == entry.getValue()) { return entry.getKey(); } } File charsFile = new File(resourcesDirPath + "/", NEW_CHARS_TXT); if (!charsFile.exists()) { try { Files.createFile(charsFile.toPath()); } catch (IOException e) { throw new 
LabelizerRuntimeException(e); } } String chars; try { chars = new String(Files.readAllBytes(charsFile.toPath())); } catch (Exception e) { throw new LabelizerRuntimeException(e); } if (!chars.contains(Character.toString(character))) { try (FileWriter fileWriter = new FileWriter(charsFile, true)) { String message = "Cannot find index for character: '" + character + "'. It will be write to the file '" + charsFile.getAbsolutePath() + "'. The default character with key '" + KEY_FOR_MISSED_CHARS + "' will be used. Please merge the " + "file content to the '" + NATIONAL_CHARS_TXT + "' file."; logger.info(message); fileWriter.append(character); } catch (Exception e) { throw new LabelizerRuntimeException(e); } } return KEY_FOR_MISSED_CHARS; } public boolean hasNext() { return hasMoreExamples(); } private boolean hasMoreExamples() { return !dateExamples.isEmpty() || patternsCount > patternsPassed; } public MultiDataSet next() { try { return next(miniBatchSize); } catch (Exception e) { logger.error(e.getMessage(), e); throw new NoSuchElementException(e.getMessage()); } } public MultiDataSet next(int miniBatches) { if (!hasMoreExamples()) { throw new NoSuchElementException(); } List> examples = new ArrayList<>(miniBatches); // Create pairs where first row is examples and second row is labels, for example // 1) 28.2.2019 11:45:00.123 1234567654 abcde 28.2.2018 11:46:00.124... // 2) ddddddddddddddddddddddwwwwwwwwwwwwwwwwwwdddddddddddddddddddddd... 
// Every pair has the same length and they equal with exampleLength for (int i = 0; i < miniBatches && hasMoreExamples(); i++) { StringBuilder left = new StringBuilder(exampleLength); StringBuilder right = new StringBuilder(exampleLength); int randomIndex = randomFromZeroToMaxInclusive(LinesWithDateClassification.NUM_SUB_LINES.size() - 1); int numSubLines = LinesWithDateClassification.NUM_SUB_LINES.get(randomIndex); prepareDataForLearning(left, right, numSubLines); String exampleString = left.toString(); String labelsString = right.toString(); Pair pair = new Pair<>(exampleString, labelsString); examples.add(pair); } int currMinibatchSize = Math.min(miniBatches, examples.size()); //Allocate space: //Note the order here: // dimension 0 = number of examples in minibatch // dimension 1 = size of each vector (i.e., number of characters or number of labels) // dimension 2 = length of each time series/example // Why 'f' order here? See https://jrmerwin.github.io/deeplearning4j-docs/usingrnns.html, // section "Alternative: Implementing a custom DataSetIterator" INDArray input = Nd4j.create(new int[]{currMinibatchSize, intToCharMap.size(), exampleLength}, ORDERING_FORTRAN); // NOSONAR INDArray inputHint = Nd4j.create(new int[]{currMinibatchSize, 2, exampleLength}, ORDERING_FORTRAN); // NOSONAR INDArray labels = Nd4j.create(new int[]{currMinibatchSize, ProbabilityLabel.values().length, exampleLength}, ORDERING_FORTRAN); // NOSONAR for (int miniBatchIndex = 0; miniBatchIndex < currMinibatchSize; miniBatchIndex++) { String stringLine = examples.get(miniBatchIndex).getLeft(); String hintLine = SimilarityHint.linesSimilarityMarker(stringLine); String labelsLine = examples.get(miniBatchIndex).getRight(); if (stringLine.length() != labelsLine.length()) { String lines = "\nLength: '" + stringLine.length() + "', line : '" + stringLine + "'" + "\nLength: '" + labelsLine.length() + "', labels: '" + labelsLine + "'"; throw new LabelizerRuntimeException("Line and labeled line should 
have same lengths.\n" + lines); } logger.info(MINI_BATCH, miniBatchIndex, stringLine); logger.info(MINI_BATCH, miniBatchIndex, labelsLine); logger.info(MINI_BATCH, miniBatchIndex, hintLine); logger.info(""); for (int charIndex = 0; charIndex < stringLine.length(); charIndex++) { char exampleChar = stringLine.charAt(charIndex); char labelChar = labelsLine.charAt(charIndex); char hintChar = hintLine.charAt(charIndex); int exampleCharIndex = convertCharacterToIndex(exampleChar); int hintCharIndex = hintChar == 'n' ? 0 : 1; int labelCharIndex = ProbabilityLabel.findIndex(labelChar); input.putScalar(new int[]{miniBatchIndex, exampleCharIndex, charIndex}, 1.0); inputHint.putScalar(new int[]{miniBatchIndex, hintCharIndex, charIndex}, 1.0); labels.putScalar(new int[]{miniBatchIndex, labelCharIndex, charIndex}, 1.0); } } return new org.nd4j.linalg.dataset.MultiDataSet( new INDArray[]{input, inputHint}, new INDArray[]{labels} ); } @Override public void setPreProcessor(MultiDataSetPreProcessor preProcessor) { throw new UnsupportedOperationException(NOT_IMPLEMENTED); } /** * Fill the buffers with data. These data will be used for training. *

* StringLine contains several sub-lines divided with line separators. *

* Sub-line can be a main line or additional line. The main line contains date, thread name and log level, * for example *

     *     01.11.2019;00:00:01.095 DEBUG [DefaultQuartzScheduler_Worker-9] [  /   ] - Main... (main line)
     * 
. *

* The additional line follows the main line, for example *

     *     01.11.2019;12:52:09.721 ERROR [DefaultQuartzScheduler_Worker-3] [  /   ] - Error in... (main line)
     *        at com.bla.security(Class.java:65) (additional line)
     * 
* Examples *
     *     main line
     *     additional line
     *     main line
     * 
*
     *     main line
     *     additional line
     *     additional line
     * 
*
     *     main line
     *     main line
     *     additional line
     * 
*
     *     additional line
     *     main line
     *     additional line
     * 
* Lines are divided with line separators, mixed Windows and Unix randomly. * The last line contains the separator as well. *

* IP address can be placed to the both main line and additional line. * * * @param stringLine an empty buffer * @param labelsLine an empty buffer * @param numSubLines how many sub-lines the {@link LinesWithDateClassification#EXAMPLE_LENGTH_120} will be * divided to. */ @SuppressWarnings("unchecked") private void prepareDataForLearning(StringBuilder stringLine, StringBuilder labelsLine, int numSubLines) { Map isMainLine = new HashMap<>(numSubLines); for (int i = 0; i < numSubLines; i++) { boolean isMain = randomFromZeroToMaxInclusive(9) > 0; isMainLine.put(i, isMain); } if (LinesWithDateClassification.EXAMPLE_LENGTH_120 % numSubLines != 0) { throw new LabelizerRuntimeException("Constant EXAMPLE_LENGTH must be divided by " + numSubLines + " without remaining."); } int subLineLen = LinesWithDateClassification.EXAMPLE_LENGTH_120 / numSubLines; boolean dateIsFirst = randomFromZeroToMaxInclusive(9) > 0; boolean containsThreadName = randomFromZeroToMaxInclusive(9) > 0; boolean containsLogLevel = randomFromZeroToMaxInclusive(19) > 0; boolean threadNameBeforeLogLevel = randomFromZeroToMaxInclusive(1) == 0; DateExample dateExample = nextDateExample(); Pair beforeDate = null; Pair threadName = null; Pair dateLeftBoundary = null; Pair dateRightBoundary = null; Pair logLevelLeftBoundary = null; Pair logLevelRightBoundary = null; String dateBoundary = BOUNDARIES.get(randomFromZeroToMaxInclusive(BOUNDARIES.size() - 1)); String levelBoundary = BOUNDARIES.get(randomFromZeroToMaxInclusive(BOUNDARIES.size() - 1)); for (int subLineNum = 0; subLineNum < numSubLines; subLineNum++) { StringBuilder line = new StringBuilder(subLineLen); StringBuilder labels = new StringBuilder(subLineLen); String newLine = randomFromZeroToMaxInclusive(1) == 0 ? 
"\r\n" : "\n"; String newLineLabels = StringUtils.leftPad("", newLine.length(), ProbabilityLabel.N_WITHOUT_DATE.getCharacter()); Pair newLinePair = new Pair<>(newLine, newLineLabels); int remaining = subLineLen - newLine.length(); boolean isMain = isMainLine.get(subLineNum); Pair date = nextDate(dateExample, isMain); remaining = remaining - date.getLeft().length(); dateLeftBoundary = dateLeftBoundary(dateLeftBoundary, isMain, dateIsFirst, dateBoundary.substring(0, 1)); remaining = remaining - dateLeftBoundary.getLeft().length(); dateRightBoundary = dateRightBoundary(dateRightBoundary, isMain, dateBoundary.substring(1)); remaining = remaining - dateRightBoundary.getLeft().length(); beforeDate = getBeforeDate(beforeDate, dateIsFirst, isMain, remaining); remaining = remaining - beforeDate.getLeft().length(); threadName = getThreadName(threadName, containsThreadName, isMain, remaining); Pair threadNameCurrent = isMain ? threadName : new Pair<>("", ""); remaining = remaining - threadNameCurrent.getLeft().length(); Pair logLevel = getLogLevel(isMain, containsLogLevel, remaining); remaining = remaining - logLevel.getLeft().length(); logLevelLeftBoundary = logLevelBoundary(logLevelLeftBoundary, isMain, containsLogLevel, remaining, levelBoundary.substring(0, 1)); remaining = remaining - logLevelLeftBoundary.getLeft().length(); logLevelRightBoundary = logLevelBoundary(logLevelRightBoundary, isMain, containsLogLevel, remaining, levelBoundary.substring(1)); remaining = remaining - logLevelRightBoundary.getLeft().length(); StringBuilder logLevelFinalString = new StringBuilder(); StringBuilder logLevelFinalLabels = new StringBuilder(); appendAll(logLevelFinalString, logLevelFinalLabels, logLevelLeftBoundary, logLevel, logLevelRightBoundary); Pair logLevelFinal = new Pair<>(logLevelFinalString.toString(), logLevelFinalLabels.toString()); boolean generateIpAddress = randomFromZeroToMaxInclusive(4) == 0; Pair ipAddress = getIpAddress(generateIpAddress, remaining); remaining = 
remaining - ipAddress.getLeft().length(); appendAll(line, labels, beforeDate, dateLeftBoundary, date, dateRightBoundary); List> subFields; if (threadNameBeforeLogLevel) { subFields = Arrays.asList(threadNameCurrent, logLevelFinal, ipAddress); } else { subFields = Arrays.asList(logLevelFinal, threadNameCurrent, ipAddress); } int fillersLen = remaining / subFields.size(); if (fillersLen < 0) { fillersLen = 0; } applyFilling(line, labels, remaining, subFields, fillersLen); int lenWithNewLine = line.length() + newLine.length(); if (lenWithNewLine > subLineLen) { line.setLength(subLineLen - newLine.length()); labels.setLength(subLineLen - newLine.length()); } else { Pair lastFiller = lineFiller.generateFiller(subLineLen - lenWithNewLine); appendAll(line, labels, lastFiller); } appendAll(line, labels, newLinePair); stringLine.append(line); labelsLine.append(labels); } validateLength(stringLine, labelsLine); } @SuppressWarnings("unchecked") private void applyFilling(StringBuilder line, StringBuilder labels, int remaining, List> subFields, int fillersLen) { for (Pair pair : subFields) { Pair filler = lineFiller.generateFiller(fillersLen); remaining = remaining - filler.getFirst().length(); appendAll(line, labels, filler, pair); } } private void validateLength(StringBuilder stringLine, StringBuilder labelsLine) { if (stringLine.length() != LinesWithDateClassification.EXAMPLE_LENGTH_120) { throw new LabelizerRuntimeException("StringLine length " + stringLine.length() + " not equals with " + LinesWithDateClassification.EXAMPLE_LENGTH_120 + ", " + "stringLine: " + stringLine); } if (labelsLine.length() != LinesWithDateClassification.EXAMPLE_LENGTH_120) { throw new LabelizerRuntimeException("LabelsLine length " + labelsLine.length() + " not equals with " + LinesWithDateClassification.EXAMPLE_LENGTH_120 + ", " + "labelsLine: " + labelsLine); } } private Pair dateRightBoundary(Pair dateRightBoundary, boolean isMain, String boundary) { if (dateRightBoundary != null) { return 
dateRightBoundary; } if (!isMain) { return new Pair<>("", ""); } String labels = StringUtils.leftPad("", boundary.length(), ProbabilityLabel.N_WITHOUT_DATE.getString()); return new Pair<>(boundary, labels); } private Pair dateLeftBoundary(Pair dateLeftBoundary, boolean isMain, boolean dateIsFirst, String boundary) { if (dateLeftBoundary != null) { return dateLeftBoundary; } if (!isMain || dateIsFirst) { return new Pair<>("", ""); } String labels = StringUtils.leftPad("", boundary.length(), ProbabilityLabel.N_WITHOUT_DATE.getString()); return new Pair<>(boundary, labels); } @SuppressWarnings("unchecked") private void appendAll(StringBuilder line, StringBuilder labels, Pair... pairs) { for (Pair pair : pairs) { line.append(pair.getKey()); labels.append(pair.getValue()); } } private Pair getIpAddress(boolean generateIpAddress, int remaining) { if (!generateIpAddress) { return new Pair<>("", ""); } String boundaries = BOUNDARIES.get(randomFromZeroToMaxInclusive(BOUNDARIES.size() - 1)); String ip = IpGenerator.randomIp(); if (boundaries.length() + ip.length() > remaining) { return new Pair<>("", ""); } String result = boundaries.substring(0, 1) + ip + boundaries.substring(1); String labels = ProbabilityLabel.N_WITHOUT_DATE.getString() + StringUtils.leftPad("", ip.length(), ProbabilityLabel.I_IP_ADDRESS_AND_PORT.getCharacter()) + ProbabilityLabel.N_WITHOUT_DATE.getString(); return new Pair<>(result, labels); } private Pair getLogLevel(boolean isMain, boolean containsLogLevel, int remaining) { if (!isMain || !containsLogLevel) { return new Pair<>("", ""); } String level = LOG_LEVELS.get(randomFromZeroToMaxInclusive(LOG_LEVELS.size() - 1)); if (remaining < level.length()) { return new Pair<>("", ""); } String labels = StringUtils.leftPad("", level.length(), ProbabilityLabel.L_LOG_LEVEL.getString()); return new Pair<>(level, labels); } private Pair logLevelBoundary(Pair logLevelLeftBoundary, boolean isMain, boolean containsLogLevel, int remaining, String boundary) { if 
(logLevelLeftBoundary != null) { return logLevelLeftBoundary; } if (!isMain || !containsLogLevel || remaining < boundary.length()) { return new Pair<>("", ""); } String labels = StringUtils.leftPad("", boundary.length(), ProbabilityLabel.N_WITHOUT_DATE.getString()); return new Pair<>(boundary, labels); } private Pair getThreadName(Pair threadName, boolean containsThreadName, boolean isMain, int remaining) { if (threadName != null) { return threadName; } if (!containsThreadName || !isMain) { return new Pair<>("", ""); } boolean useThreadSubstring = randomFromZeroToMaxInclusive(9) == 9; String threadSubstring = ""; boolean before = randomFromZeroToMaxInclusive(1) == 1; if (useThreadSubstring) { String separator = SEPARATORS.get(randomFromZeroToMaxInclusive(SEPARATORS.size() - 1)); threadSubstring = THREAD_COMMON_NAMES.get(randomFromZeroToMaxInclusive(THREAD_COMMON_NAMES.size() - 1)); if (before) { threadSubstring = threadSubstring + separator; } else { threadSubstring = separator + threadSubstring; } } int randomLen = getRandom(3, 15); int length = Math.min(remaining - threadSubstring.length(), randomLen + threadSubstring.length()); if (length < 1) { return new Pair<>("", ""); } String filler = lineFiller.generateFiller(length).getKey(); String threadNameWithoutBoundaries; if (before) { threadNameWithoutBoundaries = threadSubstring + filler; } else { threadNameWithoutBoundaries = filler + threadSubstring; } String boundary = BOUNDARIES.get(randomFromZeroToMaxInclusive(BOUNDARIES.size() - 1)); String threadNameFinal = boundary.substring(0, 1) + threadNameWithoutBoundaries + boundary.substring(1); String labels = ProbabilityLabel.N_WITHOUT_DATE.getString() + StringUtils.leftPad("", threadNameWithoutBoundaries.length(), ProbabilityLabel.T_THREAD.getCharacter()) + ProbabilityLabel.N_WITHOUT_DATE.getString(); return new Pair<>(threadNameFinal, labels); } private Pair getBeforeDate(Pair beforeDate, boolean dateIsFirst, boolean isMain, int remaining) { if (beforeDate != 
null) { return beforeDate; } if (!isMain || dateIsFirst) { return new Pair<>("", ""); } if (remaining < 1) { remaining = 1; } int fillersLen = randomFromZeroToMaxInclusive(Math.min(19, remaining - 1)); return lineFiller.generateFiller(fillersLen); } private Pair nextDate(DateExample dateExample, boolean isMain) { if (!isMain) { return new Pair<>("", ""); } Pair result = new Pair<>(dateExample.getSource(), dateExample.getLabels()); // prepare the data for next invocation Date date = dateExample.getDate(); String pattern = dateExample.getPattern(); Locale locale = dateExample.getLocale(); TimeZone timeZone = dateExample.getTimeZone(); SimpleDateFormat simpleDateFormat = new SimpleDateFormat(pattern, locale); simpleDateFormat.setTimeZone(timeZone); int oneSecond = 1000; int randomAddition = randomFromZeroToMaxInclusive(oneSecond); date.setTime(date.getTime() + randomAddition); String dateString = simpleDateFormat.format(date); GregorianCalendar gregorianCalendar = new GregorianCalendar(); gregorianCalendar.setTime(date); String labels = TrainingDataGenerator.findLabels(dateString, pattern, locale, gregorianCalendar, timeZone); dateExample.setDate(date); dateExample.setSource(dateString); dateExample.setLabels(labels); return result; } /** * @return A {@link DateExample} from {@link #dateExamples} or 'null'. The {@link #dateExamples} is filled with * the {@link PagePatternRepository#getNotTrainedPattern()} method. 
*/ private DateExample nextDateExample() { if (dateExamples.isEmpty()) { PagePatternRepository pagePatternRepository = PagePatternRepository.getInstance(); PagePattern pagePattern = pagePatternRepository.getNotTrainedPattern(); if (lastPagePattern != null) { lastPagePattern.setTrained(true); pagePatternRepository.save(lastPagePattern); patternsPassed = patternsCount - pagePatternRepository.countNotTrainedPatterns(); } lastPagePattern = pagePattern; dateExamples.addAll(TrainingDataGenerator.generateDates(pagePattern, NUM_EXAMPLES_OF_DATE_PATTERN_100)); if (dateExamples.isEmpty()) { logger.info("List of dateExamples is empty. Last PagePattern: {}", lastPagePattern); return nextDateExample(); } } return dateExamples.remove(0); } /** * @return Size of {@link #intToCharMap}. */ public int inputColumns() { return intToCharMap.size(); } /** * @return Number of {@link ProbabilityLabel}s. */ public int totalOutcomes() { return ProbabilityLabel.values().length; } public void reset() { PagePatternRepository.getInstance().resetTrained(); patternsPassed = 0; patternsCount = PagePatternRepository.getInstance().countNotTrainedPatterns(); lastPagePattern = null; } public boolean resetSupported() { return true; } @Override public boolean asyncSupported() { return true; } @Override public MultiDataSetPreProcessor getPreProcessor() { throw new UnsupportedOperationException(NOT_IMPLEMENTED); } @Override public void remove() { throw new UnsupportedOperationException(); } @Override public void forEachRemaining(Consumer action) { throw new UnsupportedOperationException(NOT_IMPLEMENTED); } private static int randomFromZeroToMaxInclusive(int max) { if (max <= 0) { return 0; } return getRandom(0, max + 1); } private static int getRandom(int minInclusive, int maxExclusive) { if (minInclusive >= maxExclusive) { return minInclusive; } return ThreadLocalRandom.current().nextInt(minInclusive, maxExclusive); } /** * Parse an input line and mark some digits as {@link ProbabilityLabel#Y_YEAR} (y) * 
and others as {@link ProbabilityLabel#N_WITHOUT_DATE} (n). *

* Digits are considered as a year (y) when they are a part of a year * from 1980 to current year + 1 (next year). *

* Year can be formatted as 4 digits, for example 2019 or two digits, for example 19. * * @param line input with (or without) date pattern(s), for example abc 2019.05.10 def * @return for example nnnnyyyynyynyynnnn */ public static String yearHintLenient(String line) { StringBuilder result = new StringBuilder(line.length()); StringBuilder context4 = new StringBuilder(4); StringBuilder context2 = new StringBuilder(2); for (char character : line.toCharArray()) { result.append(ProbabilityLabel.N_WITHOUT_DATE.getCharacter()); if (Character.isDigit(character)) { processDigit(result, context4, context2, character); } else { context4.setLength(0); context2.setLength(0); } } return result.toString(); } private static void processDigit(StringBuilder result, StringBuilder context4, StringBuilder context2, char character) { context4.append(character); context2.append(character); if (context4.length() == 4) { if (isDate(Integer.parseInt(context4.toString()))) { writeToResult(result, 4); } context4.deleteCharAt(0); } if (context2.length() == 2) { if (isDate(Integer.parseInt(context2.toString()))) { writeToResult(result, 2); } context2.deleteCharAt(0); } } private static void writeToResult(StringBuilder result, int numToAppend) { String labels = StringUtils.rightPad("", numToAppend, ProbabilityLabel.Y_YEAR.getCharacter()); result.setLength(result.length() - numToAppend); result.append(labels); } private static boolean isDate(Integer contextResult) { return ((contextResult >= Hint.OLDEST_YEAR && contextResult <= Hint.ACTUAL_YEAR + 1) || (contextResult >= Hint.SHORT_ZERO_YEAR && contextResult <= Hint.SHORT_ACTUAL_YEAR + 1) || (contextResult >= Hint.SHORT_OLD_YEAR && contextResult < Hint.SHORT_HELPFULL_YEAR)); } public static int countOfSuccessfullyMarkedChars(String recognizedOutput, String expectedOutput) { if (recognizedOutput.length() != expectedOutput.length()) { throw new LabelizerRuntimeException("RecognizedOutput and expectedOutput length are not equal. 
" + "recognizedOutput: '" + recognizedOutput + "', expectedOutput: '" + expectedOutput + "'"); } int result = 0; for (int index = 0; index < recognizedOutput.length(); index++) { if (recognizedOutput.charAt(index) == expectedOutput.charAt(index)) { result++; } } return result; } /** * How many lines for training contains the {@link CharIterator}. The value is calculated from {@link #patternsCount} * multiplied with {@link #NUM_EXAMPLES_OF_DATE_PATTERN_100}. * @return The lines size */ public long trainingDataSetSize() { return NUM_EXAMPLES_OF_DATE_PATTERN_100 * patternsCount; } /** * How many lines for training remained in the {@link CharIterator}. The value is calculated from {@link #trainingDataSetSize()} * minus {@link #patternsPassed} multiplied with {@link #NUM_EXAMPLES_OF_DATE_PATTERN_100}. * @return The remaining lines number */ public long getRemainingDataSetSize() { return trainingDataSetSize() - (patternsPassed * NUM_EXAMPLES_OF_DATE_PATTERN_100); } /** * If the {@link #dateExamples} is empty, return 'true'. It means all the examples has been trained and the next * {@link PagePattern} can be loaded from the {@link PagePatternRepository#getNotTrainedPattern()} if exists. * @return 'true' if the pattern has been trained */ public boolean isPatternTrained() { return dateExamples.isEmpty(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy