All downloads are free. Search and download functionality uses the official Maven repository.

de.gwdg.metadataqa.marc.cli.ValidatorCli Maven / Gradle / Ivy

package de.gwdg.metadataqa.marc.cli;

import de.gwdg.metadataqa.marc.CsvUtils;
import de.gwdg.metadataqa.marc.analysis.validator.Validator;
import de.gwdg.metadataqa.marc.analysis.validator.ValidatorConfiguration;
import de.gwdg.metadataqa.marc.dao.record.BibliographicRecord;
import de.gwdg.metadataqa.marc.cli.parameters.ValidatorParameters;
import de.gwdg.metadataqa.marc.cli.processor.BibliographicInputProcessor;
import de.gwdg.metadataqa.marc.cli.utils.RecordIterator;
import de.gwdg.metadataqa.marc.model.validation.ValidationError;
import de.gwdg.metadataqa.marc.model.validation.ValidationErrorCategory;
import de.gwdg.metadataqa.marc.model.validation.ValidationErrorFormatter;
import de.gwdg.metadataqa.marc.model.validation.ValidationErrorType;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.marc4j.marc.Record;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;

import static de.gwdg.metadataqa.marc.Utils.*;
import static de.gwdg.metadataqa.marc.model.validation.ValidationErrorFormat.TAB_SEPARATED;

/**
 * Command line entry point for validating bibliographic records.
 *
 * usage:
 * java -cp target/metadata-qa-marc-0.1-SNAPSHOT-jar-with-dependencies.jar de.gwdg.metadataqa.marc.cli.ValidatorCli [options] [MARC21 file]
 *
 * @author Péter Király
 */
public class ValidatorCli implements BibliographicInputProcessor, Serializable {

  private static final Logger logger = Logger.getLogger(ValidatorCli.class.getCanonicalName());
  private Options options;

  private final ValidatorParameters parameters;
  private final Map totalRecordCounter = new HashMap<>();
  private final Map totalInstanceCounter = new HashMap<>();
  private final Map categoryRecordCounter = new EnumMap<>(ValidationErrorCategory.class);
  private final Map categoryInstanceCounter = new EnumMap<>(ValidationErrorCategory.class);
  private final Map typeRecordCounter = new EnumMap<>(ValidationErrorType.class);
  private final Map typeInstanceCounter = new EnumMap<>(ValidationErrorType.class);
  private final Map instanceBasedErrorCounter = new HashMap<>();
  private final Map recordBasedErrorCounter = new HashMap<>();
  private final Map hashedIndex = new HashMap<>();
  private final Map> errorCollector = new TreeMap<>();
  private final Map> isbnCollector = new TreeMap<>();
  private final Map> issnCollector = new TreeMap<>();
  private File detailsFile = null;
  private File summaryFile = null;
  private File collectorFile = null;
  private boolean doPrintInProcessRecord = true;
  private boolean readyToProcess;
  private int counter;
  private int numberOfprocessedRecords;
  private char separator;
  private boolean hasSeparator = false;
  private int vErrorId = 1;
  private List allValidationErrors;
  private ValidatorConfiguration validatorConfiguration;

  /**
   * Creates a validator from raw command line arguments by wrapping them
   * into a {@link ValidatorParameters} object.
   *
   * @param args the command line arguments
   * @throws ParseException propagated from the parameter handling
   */
  public ValidatorCli(String[] args) throws ParseException {
    this(new ValidatorParameters(args));
  }

  public ValidatorCli(ValidatorParameters parameters) throws ParseException {
    this.parameters = parameters;
    options = parameters.getOptions();
    readyToProcess = true;
    counter = 0;
    validatorConfiguration = new ValidatorConfiguration()
      .withMarcVersion(parameters.getMarcVersion())
      .withDoSummary(parameters.doSummary())
      .withIgnorableFields(parameters.getIgnorableFields())
      .withIgnorableIssueTypes(parameters.getIgnorableIssueTypes());
  }

  /**
   * Command line entry point.
   *
   * <p>Exit status: {@code 1} when the arguments cannot be parsed or no input
   * file is given, {@code 0} when only the help text was requested. Otherwise
   * the records are processed through a {@link RecordIterator}.
   *
   * @param args the command line arguments
   */
  public static void main(String[] args) {
    BibliographicInputProcessor processor = null;
    try {
      processor = new ValidatorCli(args);
    } catch (ParseException e) {
      System.err.println("ERROR. " + e.getLocalizedMessage());
      // a failed start must be visible to the calling shell/script
      System.exit(1);
    }
    // An explicit help request is served first, so that asking for help without
    // an input file is not reported as an error.
    if (processor.getParameters().doHelp()) {
      processor.printHelp(processor.getParameters().getOptions());
      System.exit(0);
    }
    if (processor.getParameters().getArgs().length < 1) {
      System.err.println("Please provide a MARC file name!");
      processor.printHelp(processor.getParameters().getOptions());
      System.exit(1);
    }
    RecordIterator iterator = new RecordIterator(processor);
    iterator.start();
  }

  /**
   * Prints the usage line and the description of the given options.
   *
   * @param options the options to describe; when {@code null}, the options
   *                taken from this instance's parameters are used instead
   */
  public void printHelp(Options options) {
    // The argument used to be ignored (the parameter name was misspelled, so the
    // body silently referred to the field). Fall back to the field for null only.
    Options effectiveOptions = (options != null) ? options : this.options;
    HelpFormatter formatter = new HelpFormatter();
    String message = String.format("java -cp metadata-qa-marc.jar %s [options] [file]",
      this.getClass().getCanonicalName());
    formatter.printHelp(message, effectiveOptions);
  }

  /**
   * Returns the parameters this validator was constructed with.
   */
  @Override
  public ValidatorParameters getParameters() {
    return parameters;
  }

  /**
   * Logs the parameters, sets up the report files and writes their header rows.
   *
   * <p>Unless standard output is used, the details file is (re)created. If a
   * summary file name is configured, the summary file and the
   * {@code issue-collector.csv} file are (re)created as well and the collector
   * header is written; otherwise, in summary mode, the summary goes into the
   * details file. The list collecting all validation errors is created only
   * when that feature is enabled.
   */
  @Override
  public void beforeIteration() {
    logger.info(parameters.formatParameters());

    if (!parameters.useStandardOutput()) {
      String outputDir = parameters.getOutputDir();
      detailsFile = prepareReportFile(outputDir, parameters.getDetailsFileName());
      logger.info("details output: " + detailsFile.getPath());

      String summaryFileName = parameters.getSummaryFileName();
      if (summaryFileName != null) {
        summaryFile = prepareReportFile(outputDir, summaryFileName);
        logger.info("summary output: " + summaryFile.getPath());

        collectorFile = prepareReportFile(outputDir, "issue-collector.csv");
        String collectorHeader = ValidationErrorFormatter.formatHeaderForCollector(parameters.getFormat());
        print(collectorFile, collectorHeader + "\n");
      } else if (parameters.doSummary()) {
        // no dedicated summary file: the summary shares the details file
        summaryFile = detailsFile;
      }
    }

    if (parameters.doDetails()) {
      String detailsHeader = ValidationErrorFormatter.formatHeaderForDetails(parameters.getFormat());
      print(detailsFile, detailsHeader + "\n");
    }

    if (parameters.collectAllErrors()) {
      allValidationErrors = new ArrayList<>();
    }
  }

  /**
   * Returns a handle to a report file inside the output directory, deleting a
   * previous version of the file if one exists.
   *
   * @param outputDir the directory of the report
   * @param fileName  the name of the report file
   * @return the file handle; the file itself is not created here
   */
  private File prepareReportFile(String outputDir, String fileName) {
    File reportFile = new File(outputDir, fileName);
    if (reportFile.exists() && !reportFile.delete()) {
      // java.util.logging substitutes MessageFormat style placeholders ({0}), not "{}".
      // The message also avoids an apostrophe, which MessageFormat treats as a quote.
      logger.log(Level.SEVERE, "File {0} has not been deleted", reportFile.getAbsolutePath());
    }
    return reportFile;
  }

  /**
   * Callback receiving the path of an input file; this processor takes no action.
   *
   * @param currentFile the path of the file (unused)
   */
  @Override
  public void fileOpened(Path currentFile) {
    // do nothing
  }

  /**
   * File-level callback; this processor takes no action.
   */
  @Override
  public void fileProcessed() {
    // do nothing
  }

  /**
   * Callback for raw marc4j records; this processor takes no action here, the
   * validation is done in the overload taking a {@code BibliographicRecord}.
   *
   * @param marc4jRecord the marc4j record (unused)
   * @param recordNumber the number of the record (unused)
   */
  @Override
  public void processRecord(Record marc4jRecord, int recordNumber) throws IOException {
    // do nothing
  }

  /**
   * Validates one record and feeds the result into the summary and details reports.
   *
   * <p>Records accepted by the record ignorator are skipped entirely and do not
   * increase {@code counter}. A record is counted under key {@code 0} of
   * {@code totalRecordCounter} (summary mode only) when it is valid, or when
   * reporting inside this method is switched off.
   *
   * @param marcRecord the record to validate
   * @param i          the number of the record, used for logging
   */
  @Override
  public void processRecord(BibliographicRecord marcRecord, int i) {
    if (marcRecord.getId() == null) {
      logger.severe("No record number at " + i);
    }

    if (i % 100000 == 0) {
      logger.info("Number of error types so far: " + instanceBasedErrorCounter.size());
    }

    if (parameters.getRecordIgnorator().isIgnorable(marcRecord)) {
      logger.info("skip " + marcRecord.getId() + " (ignorable record)");
      return;
    }

    Validator validator = new Validator(validatorConfiguration);
    boolean hasIssuesToReport = !validator.validate(marcRecord) && doPrintInProcessRecord;

    if (hasIssuesToReport) {
      if (parameters.doSummary()) {
        processSummary(marcRecord, validator);
      }
      if (parameters.doDetails()) {
        processDetails(marcRecord, validator);
      }
    } else if (parameters.doSummary()) {
      count(0, totalRecordCounter);
    }

    if (parameters.collectAllErrors()) {
      allValidationErrors.addAll(validator.getValidationErrors());
    }
    counter++;
  }

  /**
   * Writes the issues of one record to the details output.
   *
   * <p>In summary mode the record is reported as its id plus a count per issue
   * id; issues that have no id yet get the one registered for their hash code
   * in {@code hashedIndex}. Otherwise the issues themselves are formatted.
   * Nothing is written when the validator reported no issues.
   *
   * @param marcRecord the validated record
   * @param validator  the validator holding the issues of the record
   */
  private void processDetails(BibliographicRecord marcRecord, Validator validator) {
    List<ValidationError> errors = validator.getValidationErrors();
    if (errors.isEmpty()) {
      return;
    }

    String message;
    if (parameters.doSummary()) {
      // issue id -> number of occurrences within this record
      Map<Integer, Integer> errorIds = new HashMap<>();
      for (ValidationError error : errors) {
        if (error.getId() == null) {
          error.setId(hashedIndex.get(error.hashCode()));
        }
        count(error.getId(), errorIds);
      }
      message = ValidationErrorFormatter.formatSimple(
        marcRecord.getId(parameters.getTrimId()), parameters.getFormat(), errorIds
      );
    } else {
      message = ValidationErrorFormatter.format(errors, parameters.getFormat(), parameters.getTrimId());
    }

    if (message != null) {
      print(detailsFile, message);
    }
  }

  /**
   * Updates all summary counters with the issues of one record.
   *
   * <p>Each issue gets an id: a not yet seen issue receives the next free id,
   * which is registered under the issue's hash code; a known issue gets the
   * registered id. Instance counters are increased once per issue, record
   * counters once per distinct issue id, type and category found in the record.
   * In the "total" counters key {@code 1} stands for all issues and key
   * {@code 2} for issues whose type is not {@code FIELD_UNDEFINED}.
   *
   * @param marcRecord the validated record
   * @param validator  the validator holding the issues of the record
   */
  private void processSummary(BibliographicRecord marcRecord, Validator validator) {
    List<ValidationError> errors = validator.getValidationErrors();
    List<ValidationError> allButInvalidFieldErrors = new ArrayList<>();
    Set<Integer> uniqueErrors = new HashSet<>();
    Set<ValidationErrorType> uniqueTypes = new HashSet<>();
    Set<ValidationErrorCategory> uniqueCategories = new HashSet<>();

    for (ValidationError error : errors) {
      if (!instanceBasedErrorCounter.containsKey(error)) {
        error.setId(vErrorId++);
        hashedIndex.put(error.hashCode(), error.getId());
      } else {
        error.setId(hashedIndex.get(error.hashCode()));
      }

      ValidationErrorType type = error.getType();
      if (type != ValidationErrorType.FIELD_UNDEFINED) {
        count(2, totalInstanceCounter);
        allButInvalidFieldErrors.add(error);
      }

      count(error, instanceBasedErrorCounter);
      count(type, typeInstanceCounter);
      count(type.getCategory(), categoryInstanceCounter);
      count(1, totalInstanceCounter);
      updateErrorCollector(marcRecord.getId(true), error.getId());
      uniqueErrors.add(error.getId());
      uniqueTypes.add(type);
      uniqueCategories.add(type.getCategory());
    }

    // record based counters: each distinct id/type/category counts once per record
    for (Integer id : uniqueErrors) {
      count(id, recordBasedErrorCounter);
    }
    for (ValidationErrorType type : uniqueTypes) {
      count(type, typeRecordCounter);
    }
    for (ValidationErrorCategory category : uniqueCategories) {
      count(category, categoryRecordCounter);
    }
    count(1, totalRecordCounter);
    if (!allButInvalidFieldErrors.isEmpty()) {
      count(2, totalRecordCounter);
    }
  }

  /**
   * Writes the final reports: the record count always, and in summary mode the
   * summary, the per-category, per-type and total counts, and the collector.
   *
   * @param numberOfprocessedRecords the record count reported by the caller;
   *                                 stored and written to {@code count.csv}
   */
  @Override
  public void afterIteration(int numberOfprocessedRecords) {
    logger.info("printCounter");
    this.numberOfprocessedRecords = numberOfprocessedRecords;
    printCounter();

    // getSeparator() also stores the separator in the field of the same name
    char summarySeparator = getSeparator();
    boolean summaryRequested = parameters.doSummary();
    if (summaryRequested) {
      logger.info("printSummary");
      printSummary(summarySeparator);

      logger.info("printCategoryCounts");
      printCategoryCounts();

      logger.info("printTypeCounts");
      printTypeCounts();

      logger.info("printTotalCounts");
      printTotalCounts();

      logger.info("printCollector");
      printCollector();
    }
    logger.info("all printing is DONE");
  }

  /**
   * Writes {@code count.csv} into the output directory: a header line followed
   * by one data line. Without record filtering the file has the single column
   * {@code total}; otherwise it has {@code total} and {@code processed}, the
   * latter being the number of records that were actually validated.
   */
  private void printCounter() {
    File countFile = prepareReportFile(parameters.getOutputDir(), "count.csv");

    String header;
    String row;
    if (parameters.getRecordIgnorator().isEmpty()) {
      header = "total";
      row = String.valueOf(numberOfprocessedRecords);
    } else {
      header = String.join(",", "total", "processed");
      row = String.join(",", String.valueOf(numberOfprocessedRecords), String.valueOf(counter));
    }

    printToFile(countFile, header + "\n");
    printToFile(countFile, row + "\n");
  }

  private void printCollector() {
    for (Map.Entry> entry : errorCollector.entrySet()) {
      printCollectorEntry(entry.getKey(), entry.getValue());
    }
  }

  /**
   * Writes the summary report: a header, then one row per distinct issue with
   * its id, its description cells, the number of instances and the number of
   * affected records. Rows are ordered by issue type id, and within a type by
   * descending record count.
   *
   * @param separator currently unused; the rows are built by {@code CsvUtils.createCsv}
   */
  private void printSummary(char separator) {
    String header = ValidationErrorFormatter.formatHeaderForSummary(parameters.getFormat());
    print(summaryFile, header + "\n");

    List<Map.Entry<ValidationError, Integer>> rows = new ArrayList<>(instanceBasedErrorCounter.entrySet());
    rows.sort((left, right) -> {
      int byType = Integer.compare(
        left.getKey().getType().getId(),
        right.getKey().getType().getId()
      );
      if (byType != 0) {
        return byType;
      }
      // same type: the issue affecting more records comes first
      Integer recordCountLeft = recordBasedErrorCounter.get(left.getKey().getId());
      Integer recordCountRight = recordBasedErrorCounter.get(right.getKey().getId());
      return recordCountRight.compareTo(recordCountLeft);
    });

    for (Map.Entry<ValidationError, Integer> row : rows) {
      ValidationError error = row.getKey();
      int instanceCount = row.getValue();
      // element type kept raw, as in the original source
      List cells = new ArrayList<>();
      cells.add(error.getId());
      cells.addAll(Arrays.asList(ValidationErrorFormatter.asArrayWithoutId(error)));
      cells.addAll(Arrays.asList(instanceCount, recordBasedErrorCounter.get(error.getId())));
      // TODO: separator
      print(summaryFile, CsvUtils.createCsv(cells));
    }
  }

  /**
   * Writes {@code issue-by-type.csv} into the output directory: a header and
   * one row per issue type that occurred, ordered by type id, with the number
   * of instances and of affected records. I/O failures are logged, not thrown;
   * a failing row does not stop the remaining rows from being attempted.
   */
  private void printTypeCounts() {
    var path = Paths.get(parameters.getOutputDir(), "issue-by-type.csv");
    try (var writer = Files.newBufferedWriter(path)) {
      writer.write(createRow("id", "categoryId", "category", "type", "instances", "records"));

      Comparator<Map.Entry<ValidationErrorType, Integer>> byTypeId =
        (left, right) -> ((Integer) left.getKey().getId()).compareTo((Integer) right.getKey().getId());
      List<Map.Entry<ValidationErrorType, Integer>> rows = new ArrayList<>(typeRecordCounter.entrySet());
      rows.sort(byTypeId);

      for (Map.Entry<ValidationErrorType, Integer> row : rows) {
        ValidationErrorType type = row.getKey();
        int records = row.getValue();
        int instances = typeInstanceCounter.get(type);
        try {
          writer.write(createRow(
            type.getId(),
            type.getCategory().getId(),
            type.getCategory().getName(),
            quote(type.getMessage()),
            instances,
            records
          ));
        } catch (IOException e) {
          logger.log(Level.SEVERE, "printTypeCounts", e);
        }
      }
    } catch (IOException e) {
      logger.log(Level.SEVERE, "printTypeCounts", e);
    }
  }

  /**
   * Writes {@code issue-total.csv} into the output directory: a header and one
   * row per key of the total record counter, with the matching instance count
   * ({@code 0} when no instance was counted for the key). I/O failures are
   * logged, not thrown.
   */
  private void printTotalCounts() {
    var path = Paths.get(parameters.getOutputDir(), "issue-total.csv");
    try (var writer = Files.newBufferedWriter(path)) {
      writer.write(createRow("type", "instances", "records"));

      for (Map.Entry<Integer, Integer> row : totalRecordCounter.entrySet()) {
        Integer key = row.getKey();
        int records = row.getValue();
        int instances = totalInstanceCounter.getOrDefault(key, 0);
        try {
          writer.write(createRow(key, instances, records));
        } catch (IOException e) {
          logger.log(Level.SEVERE, "printTotalCounts", e);
        }
      }
    } catch (IOException e) {
      logger.log(Level.SEVERE, "printTotalCounts", e);
    }
  }

  /**
   * Writes {@code issue-by-category.csv} into the output directory: a header
   * and one row per issue category that occurred, ordered by category id, with
   * the number of instances ({@code -1} when none was counted) and of affected
   * records. I/O failures are logged, not thrown.
   */
  private void printCategoryCounts() {
    var path = Paths.get(parameters.getOutputDir(), "issue-by-category.csv");
    try (var writer = Files.newBufferedWriter(path)) {
      writer.write(createRow("id", "category", "instances", "records"));

      Comparator<Map.Entry<ValidationErrorCategory, Integer>> byCategoryId =
        (left, right) -> ((Integer) left.getKey().getId()).compareTo((Integer) right.getKey().getId());
      List<Map.Entry<ValidationErrorCategory, Integer>> rows = new ArrayList<>(categoryRecordCounter.entrySet());
      rows.sort(byCategoryId);

      for (Map.Entry<ValidationErrorCategory, Integer> row : rows) {
        ValidationErrorCategory category = row.getKey();
        int records = row.getValue();
        int instances = categoryInstanceCounter.getOrDefault(category, -1);
        try {
          writer.write(createRow(category.getId(), category.getName(), instances, records));
        } catch (IOException e) {
          logger.log(Level.SEVERE, "printCategoryCounts", e);
        }
      }
    } catch (IOException e) {
      logger.log(Level.SEVERE, "printCategoryCounts", e);
    }
  }

  /**
   * Returns the column separator matching the configured output format: a tab
   * for the tab separated format, a comma otherwise. The value is computed on
   * the first call and stored in the {@code separator} field.
   *
   * @return the column separator
   */
  private char getSeparator() {
    if (!hasSeparator) {
      separator = parameters.getFormat().equals(TAB_SEPARATED) ? '\t' : ',';
      // remember that the field is initialised; the flag used to stay false forever
      hasSeparator = true;
    }
    return separator;
  }

  /**
   * Writes one collector line: the issue id, the column separator, and the
   * record ids joined by semicolons.
   *
   * @param errorId   the id of the issue
   * @param recordIds the ids of the records in which the issue occurs
   */
  private void printCollectorEntry(Integer errorId, Set<String> recordIds) {
    // Ask getSeparator() instead of reading the field: this method is also reached
    // while records are still being processed (large collectors are flushed early),
    // when the field may not have been initialised yet and would be '\0'.
    // Building the whole line first also means one write instead of one per record id.
    String line = String.valueOf(errorId) + getSeparator() + String.join(";", recordIds) + "\n";
    print(collectorFile, line);
  }

  /**
   * Sends a message to standard output when that mode is configured,
   * otherwise appends it to the given file.
   *
   * @param file    the target file; not used in standard output mode
   * @param message the text to write
   */
  private void print(File file, String message) {
    if (!parameters.useStandardOutput()) {
      printToFile(file, message);
      return;
    }
    System.out.print(message);
  }

  /**
   * Appends a message to a file using the platform default charset.
   * An I/O failure is logged only when logging is enabled in the parameters;
   * it is never rethrown.
   *
   * @param file    the file to append to
   * @param message the text to append
   */
  private void printToFile(File file, String message) {
    try {
      FileUtils.writeStringToFile(file, message, Charset.defaultCharset(), true);
    } catch (IOException e) {
      if (!parameters.doLog()) {
        return;
      }
      logger.log(Level.SEVERE, "printToFile", e);
    }
  }

  /**
   * Registers a record id under an issue id in the collector.
   *
   * <p>When emptying of large collectors is enabled and the set of an already
   * known issue has reached 1000 record ids, the set is written out as a
   * collector line and replaced by an empty one before the new id is added.
   *
   * @param recordId the id of the record having the issue
   * @param errorId  the id of the issue
   */
  private void updateErrorCollector(String recordId, int errorId) {
    Set<String> recordIds = errorCollector.get(errorId);
    if (recordIds == null) {
      recordIds = new HashSet<>();
      errorCollector.put(errorId, recordIds);
    } else if (parameters.doEmptyLargeCollectors() && recordIds.size() >= 1000) {
      printCollectorEntry(errorId, recordIds);
      recordIds = new HashSet<>();
      errorCollector.put(errorId, recordIds);
    }
    recordIds.add(recordId);
  }

  /**
   * Tells whether issues are reported (summary/details) while a record is
   * being processed. The flag is {@code true} unless changed by its setter.
   */
  public boolean doPrintInProcessRecord() {
    return doPrintInProcessRecord;
  }

  /**
   * Switches the reporting of issues during record processing on or off.
   *
   * @param doPrintInProcessRecord {@code false} to skip the summary and details
   *                               processing of invalid records
   */
  public void setDoPrintInProcessRecord(boolean doPrintInProcessRecord) {
    this.doPrintInProcessRecord = doPrintInProcessRecord;
  }

  /**
   * Returns the readiness flag, which the constructor sets to {@code true}.
   */
  @Override
  public boolean readyToProcess() {
    return readyToProcess;
  }

  /**
   * Returns the issues collected from all validated records.
   *
   * @return the collected issues, or {@code null} when collecting all errors
   *         is not enabled in the parameters (the list is only created by
   *         {@code beforeIteration()} in that mode)
   */
  public List<ValidationError> getAllValidationErrors() {
    return allValidationErrors;
  }

  /**
   * Returns the number of records that were validated; records skipped as
   * ignorable are not included.
   */
  public int getCounter() {
    return counter;
  }

  /**
   * Returns the record count that was handed over to {@code afterIteration(int)}.
   */
  public int getNumberOfprocessedRecords() {
    return numberOfprocessedRecords;
  }

  /**
   * Returns the validator configuration built from the parameters in the constructor.
   */
  public ValidatorConfiguration getValidityConfiguration() {
    return validatorConfiguration;
  }

  private class Counter {
    int id;
    int count;

    public Counter(int count, int id) {
      this.count = count;
      this.id = id;
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy