
package de.gwdg.metadataqa.marc.cli;

import de.gwdg.metadataqa.marc.Utils;
import de.gwdg.metadataqa.marc.analysis.ClassificationAnalyzer;
import de.gwdg.metadataqa.marc.analysis.ClassificationStatistics;
import de.gwdg.metadataqa.marc.cli.parameters.CommonParameters;
import de.gwdg.metadataqa.marc.cli.parameters.ValidatorParameters;
import de.gwdg.metadataqa.marc.cli.processor.BibliographicInputProcessor;
import de.gwdg.metadataqa.marc.cli.utils.Collocation;
import de.gwdg.metadataqa.marc.cli.utils.RecordIterator;
import de.gwdg.metadataqa.marc.cli.utils.Schema;
import de.gwdg.metadataqa.marc.dao.record.BibliographicRecord;

import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.marc4j.marc.Record;

import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;

import static de.gwdg.metadataqa.marc.Utils.createRow;
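
/**
 * Command-line tool that analyses the classification (subject) information of
 * bibliographic (MARC) records and writes a set of CSV reports
 * (classifications-by-schema.csv, classifications-by-records.csv,
 * classifications-histogram.csv, classifications-frequency-examples.csv,
 * classifications-by-schema-subfields.csv, classifications-collocations.csv)
 * into the configured output directory.
 *
 * <p>Invocation sketch (the accepted options are defined by
 * {@link ValidatorParameters}; the classpath entry stands in for the actual
 * project jar):</p>
 * <pre>
 * java -cp &lt;metadata-qa-marc jar&gt; de.gwdg.metadataqa.marc.cli.ClassificationAnalysis \
 *   [options] &lt;MARC file&gt;
 * </pre>
 */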
public class ClassificationAnalysis implements BibliographicInputProcessor, Serializable {

  private static final Logger logger = Logger.getLogger(ClassificationAnalysis.class.getCanonicalName());

  private final Options options;
  private CommonParameters parameters;
  private boolean readyToProcess;
  private static char separator = ',';
  private File collectorFile;
  ClassificationStatistics statistics = new ClassificationStatistics();
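
  /**
   * Parses the command-line arguments (the option set is shared with the
   * validator via {@link ValidatorParameters}) and resets the static state of
   * {@link Schema} for this run.
   */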
  public ClassificationAnalysis(String[] args) throws ParseException {
    parameters = new ValidatorParameters(args);
    options = parameters.getOptions();
    readyToProcess = true;
    Schema.reset();
  }
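
  /**
   * Entry point: builds the processor, validates the arguments, prints help
   * when requested, then iterates over every record of the input file.
   */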
  public static void main(String[] args) {
    BibliographicInputProcessor processor = null;
    try {
      processor = new ClassificationAnalysis(args);
    } catch (ParseException e) {
      System.err.println(createRow("ERROR. ", e.getLocalizedMessage()));
      // processor.printHelp(processor.getParameters().getOptions());
      System.exit(0);
    }
    if (processor.getParameters().getArgs().length < 1) {
      System.err.println("Please provide a MARC file name!");
      processor.printHelp(processor.getParameters().getOptions());
      System.exit(0);
    }
    if (processor.getParameters().doHelp()) {
      processor.printHelp(processor.getParameters().getOptions());
      System.exit(0);
    }
    RecordIterator iterator = new RecordIterator(processor);
    iterator.start();
  }

  @Override
  public CommonParameters getParameters() {
    return parameters;
  }
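
  /** No-op: the analysis works on the parsed {@link BibliographicRecord} variant below. */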
  @Override
  public void processRecord(Record marc4jRecord, int recordNumber) throws IOException {
  }
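
  /**
   * Analyses the classification fields of a single record and accumulates the
   * results in the shared statistics. Two independently maintained record
   * counts are cross-checked; on divergence the mismatch is logged and further
   * processing is stopped.
   */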
  @Override
  public void processRecord(BibliographicRecord marcRecord, int recordNumber) throws IOException {
    if (parameters.getRecordIgnorator().isIgnorable(marcRecord))
      return;

    ClassificationAnalyzer analyzer = new ClassificationAnalyzer(marcRecord, statistics);
    analyzer.process();

    var total1 = statistics.getHasClassifications().get(true);
    if (total1 == null)
      total1 = Integer.valueOf(0);
    var total = statistics.recordCountWithClassification();
    if (total1.intValue() != total.intValue()) {
      logger.severe(String.format("%s COUNT: total (%d) != schemasInRecord (%d)",
        marcRecord.getId(true), total1, total));
      readyToProcess = false;
    }
    /*
    List<Schema> schemas = analyzer.getSchemasInRecord();
    if (!schemas.isEmpty()) {
      List<String> abbreviations = schemas
        .stream()
        .map(Schema::getAbbreviation)
        .distinct()
        .collect(Collectors.toList());
      if (!abbreviations.isEmpty()) {
        String joined = StringUtils.join(abbreviations, ":");
        printToFile(collectorFile, Utils.createRow(marcRecord.getId(true), joined));
      }
    }
    */
  }
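
  /** Appends a message to the given file; failures are logged only when logging is enabled. */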
  private void printToFile(File file, String message) {
    try {
      FileUtils.writeStringToFile(file, message, Charset.defaultCharset(), true);
    } catch (IOException | NullPointerException e) {
      if (parameters.doLog())
        logger.log(Level.SEVERE, "printToFile", e);
    }
  }
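
  /** Deletes a previous report file of the same name so that each run starts from an empty file. */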
  private File prepareReportFile(String outputDir, String fileName) {
    File reportFile = new File(outputDir, fileName);
    if (reportFile.exists())
      if (!reportFile.delete())
        // java.util.logging uses MessageFormat-style {0} placeholders, not {}
        logger.log(Level.SEVERE, "File {0} hasn't been deleted", reportFile.getAbsolutePath());
    return reportFile;
  }

  @Override
  public void beforeIteration() {
    /*
    collectorFile = prepareReportFile(
      parameters.getOutputDir(), "classification-collocations.csv");
    */
  }

  @Override
  public void fileOpened(Path path) {
  }

  @Override
  public void fileProcessed() {
  }
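
  /** Writes all CSV reports once the iteration over the input has finished. */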
  @Override
  public void afterIteration(int numberOfProcessedRecords) {
    printClassificationsBySchema();
    printClassificationsByRecords();
    printClassificationsHistogram();
    printFrequencyExamples();
    printSchemaSubfieldsStatistics();
    printClassificationsCollocation();
  }
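
  /**
   * Writes classifications-collocations.csv: how often each combination of
   * classification schemes occurs together within a record, in descending
   * order of frequency.
   */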
  private void printClassificationsCollocation() {
    Path path = Paths.get(parameters.getOutputDir(), "classifications-collocations.csv");
    try (var writer = Files.newBufferedWriter(path)) {
      writer.write(Collocation.header());

      var total1 = statistics.getHasClassifications().get(true);
      if (total1 == null)
        total1 = Integer.valueOf(0);
      var total = statistics.recordCountWithClassification();
      logger.info("total: " + total);
      // compare by value, not by Integer identity
      if (total1.intValue() != total.intValue())
        logger.severe(String.format("total from hasClassifications (%d) != from collation (%d)",
          total1, total));

      statistics.getCollocationHistogram()
        .entrySet()
        .stream()
        .map(e -> new Collocation(e.getKey(), e.getValue(), total))
        .sorted((e1, e2) -> e1.compareTo(e2) * -1)
        .forEach(entry -> printCollocation(writer, entry));
    } catch (IOException e) {
      logger.log(Level.SEVERE, "printClassificationsCollocation", e);
    }
  }

  private void printCollocation(BufferedWriter writer, Collocation entry) {
    try {
      writer.write(entry.formatRow());
    } catch (IOException e) {
      logger.log(Level.SEVERE, "printCollocation", e);
    }
  }
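
  /**
   * Writes classifications-by-schema.csv: per classification scheme (and per
   * field/location where it occurs) the number of records and instances,
   * sorted by field, then location, then descending instance count.
   */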
  private void printClassificationsBySchema() {
    Path path = Paths.get(parameters.getOutputDir(), "classifications-by-schema.csv");
    try (var writer = Files.newBufferedWriter(path)) {
      writer.write(createRow("id", "field", "location", "scheme",
        "abbreviation", "abbreviation4solr", "recordcount", "instancecount",
        "type"
      ));
      statistics.getInstances()
        .entrySet()
        .stream()
        .sorted((e1, e2) -> {
          int i = e1.getKey().getField().compareTo(e2.getKey().getField());
          if (i != 0)
            return i;
          i = e1.getKey().getLocation().compareTo(e2.getKey().getLocation());
          if (i != 0)
            return i;
          return e2.getValue().compareTo(e1.getValue());
        })
        .forEach(entry -> printSingleClassificationBySchema(writer, entry));
    } catch (IOException e) {
      logger.log(Level.SEVERE, "printClassificationsBySchema", e);
    }
  }
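
  /**
   * Writes a single CSV row for a scheme. The scheme name and abbreviation are
   * quoted and embedded quotation marks are doubled, following CSV conventions.
   */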
  private void printSingleClassificationBySchema(BufferedWriter writer,
                                                 Map.Entry<Schema, Integer> entry) {
    Schema schema = entry.getKey();
    int instanceCount = entry.getValue();
    int recordCount = statistics.getRecords().get(schema);
    try {
      writer.write(createRow(
        schema.getId(),
        schema.getField(),
        schema.getLocation(),
        '"' + schema.getSchema().replace("\"", "\"\"") + '"',
        '"' + schema.getAbbreviation().replace("\"", "\"\"") + '"',
        Utils.solarize(schema.getAbbreviation()),
        recordCount,
        instanceCount,
        (schema.getType() == null ? "UNKNOWN" : schema.getType())
      ));
    } catch (IOException | NullPointerException ex) {
      logger.log(Level.SEVERE, "printClassificationsBySchema", ex);
      logger.severe(schema.toString());
    }
  }
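
  /**
   * Writes classifications-by-records.csv: how many records do and do not
   * carry any classification at all.
   */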
  private void printClassificationsByRecords() {
    Path path = Paths.get(parameters.getOutputDir(), "classifications-by-records.csv");
    try (var writer = Files.newBufferedWriter(path)) {
      writer.write(createRow("records-with-classification", "count"));
      statistics.getHasClassifications()
        .entrySet()
        .stream()
        .sorted((e1, e2) -> e2.getValue().compareTo(e1.getValue()))
        .forEach(e -> {
          try {
            writer.write(createRow(e.getKey().toString(), e.getValue()));
          } catch (IOException ex) {
            logger.log(Level.SEVERE, "printClassificationsByRecords", ex);
          }
        });
    } catch (IOException e) {
      logger.log(Level.SEVERE, "printClassificationsByRecords", e);
    }
  }
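
  /**
   * Writes classifications-histogram.csv: the distribution of the number of
   * classification schemes per record.
   */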
  private void printClassificationsHistogram() {
    var path = Paths.get(parameters.getOutputDir(), "classifications-histogram.csv");
    try (var writer = Files.newBufferedWriter(path)) {
      writer.write(createRow("count", "frequency"));
      statistics.getSchemaHistogram()
        .entrySet()
        .stream()
        .sorted((e1, e2) -> e1.getKey().compareTo(e2.getKey()))
        .forEach(entry -> {
          try {
            writer.write(createRow(entry.getKey(), entry.getValue()));
          } catch (IOException e) {
            logger.log(Level.SEVERE, "printClassificationsHistogram", e);
          }
        });
    } catch (IOException e) {
      logger.log(Level.SEVERE, "printClassificationsHistogram", e);
    }
  }
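
  /**
   * Writes classifications-frequency-examples.csv: an example record id for
   * each observed classification count.
   */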
  private void printFrequencyExamples() {
    var path = Paths.get(parameters.getOutputDir(), "classifications-frequency-examples.csv");
    try (var writer = Files.newBufferedWriter(path)) {
      writer.write(createRow("count", "id"));
      statistics.getFrequencyExamples()
        .entrySet()
        .stream()
        .sorted((e1, e2) -> e1.getKey().compareTo(e2.getKey()))
        .forEach(entry -> {
          try {
            writer.write(createRow(entry.getKey(), entry.getValue()));
          } catch (IOException e) {
            logger.log(Level.SEVERE, "printFrequencyExamples", e);
          }
        });
    } catch (IOException e) {
      logger.log(Level.SEVERE, "printFrequencyExamples", e);
    }
  }
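
  /**
   * Writes classifications-by-schema-subfields.csv: which subfield
   * combinations occur within each scheme's fields, and how often.
   */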
  private void printSchemaSubfieldsStatistics() {
    Path path = Paths.get(parameters.getOutputDir(), "classifications-by-schema-subfields.csv");
    try (var writer = Files.newBufferedWriter(path)) {
      // final List<String> header = Arrays.asList("field", "location", "label", "abbreviation", "subfields", "scount");
      final List<String> header = Arrays.asList("id", "subfields", "count");
      writer.write(createRow(header));
      statistics.getSubfields()
        .entrySet()
        .stream()
        .sorted((e1, e2) -> e1.getKey().getField().compareTo(e2.getKey().getField()))
        .forEach(schemaEntry -> printSingleSchemaSubfieldsStatistics(writer, schemaEntry));
    } catch (IOException e) {
      logger.log(Level.SEVERE, "printSchemaSubfieldsStatistics", e);
    }
  }
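
  /**
   * Writes the subfield-combination rows of a single schema, most frequent
   * combination first.
   */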
  private void printSingleSchemaSubfieldsStatistics(BufferedWriter writer,
                                                    Map.Entry<Schema, Map<List<String>, Integer>> schemaEntry) {
    Schema schema = schemaEntry.getKey();
    Map<List<String>, Integer> val = schemaEntry.getValue();
    val
      .entrySet()
      .stream()
      .sorted((count1, count2) -> count2.getValue().compareTo(count1.getValue()))
      .forEach(countEntry -> {
        List<String> subfields = countEntry.getKey();
        int count = countEntry.getValue();
        try {
          writer.write(createRow(
            schema.getId(),
            // schema.field,
            // schema.location,
            // '"' + schema.schema.replace("\"", "\\\"") + '"',
            // schema.abbreviation,
            StringUtils.join(subfields, ';'),
            count
          ));
        } catch (IOException ex) {
          logger.log(Level.SEVERE, "printSingleSchemaSubfieldsStatistics", ex);
        }
      });
  }

  /*
  private static String createRow(List<String> fields) {
    return StringUtils.join(fields, separator) + "\n";
  }

  private static String createRow(Object... fields) {
    return StringUtils.join(fields, separator) + "\n";
  }
  */

  @Override
  public void printHelp(Options options) {
  }

  @Override
  public boolean readyToProcess() {
    return readyToProcess;
  }
}