
// de.gwdg.metadataqa.marc.cli.AuthorityAnalysis Maven / Gradle / Ivy
package de.gwdg.metadataqa.marc.cli;
import de.gwdg.metadataqa.marc.dao.record.BibliographicRecord;
import de.gwdg.metadataqa.marc.Utils;
import de.gwdg.metadataqa.marc.analysis.AuthorithyAnalyzer;
import de.gwdg.metadataqa.marc.analysis.AuthorityCategory;
import de.gwdg.metadataqa.marc.analysis.AuthorityStatistics;
import de.gwdg.metadataqa.marc.cli.parameters.CommonParameters;
import de.gwdg.metadataqa.marc.cli.parameters.ValidatorParameters;
import de.gwdg.metadataqa.marc.cli.processor.BibliographicInputProcessor;
import de.gwdg.metadataqa.marc.cli.utils.RecordIterator;
import de.gwdg.metadataqa.marc.cli.utils.Schema;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.marc4j.marc.Record;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.Serializable;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import static de.gwdg.metadataqa.marc.Utils.count;
import static de.gwdg.metadataqa.marc.Utils.quote;
public class AuthorityAnalysis implements BibliographicInputProcessor, Serializable {
private static final Logger logger = Logger.getLogger(AuthorityAnalysis.class.getCanonicalName());
private CommonParameters parameters;
private Map histogram = new HashMap<>();
private Map frequencyExamples = new HashMap<>();
private Map hasClassifications = new HashMap<>();
private boolean readyToProcess;
private static char separator = ',';
AuthorityStatistics statistics = new AuthorityStatistics();
public AuthorityAnalysis(String[] args) throws ParseException {
parameters = new ValidatorParameters(args);
readyToProcess = true;
}
public static void main(String[] args) {
BibliographicInputProcessor processor = null;
try {
processor = new AuthorityAnalysis(args);
} catch (ParseException e) {
System.err.println(createRow("ERROR. ", e.getLocalizedMessage()));
// processor.printHelp(processor.getParameters().getOptions());
System.exit(0);
}
if (processor.getParameters().getArgs().length < 1) {
System.err.println("Please provide a MARC file name!");
processor.printHelp(processor.getParameters().getOptions());
System.exit(0);
}
if (processor.getParameters().doHelp()) {
processor.printHelp(processor.getParameters().getOptions());
System.exit(0);
}
var iterator = new RecordIterator(processor);
iterator.start();
}
@Override
public CommonParameters getParameters() {
return parameters;
}
@Override
public void processRecord(Record marc4jRecord, int recordNumber) throws IOException {
// do nothing
}
@Override
public void processRecord(BibliographicRecord marcRecord, int recordNumber) throws IOException {
if (parameters.getRecordIgnorator().isIgnorable(marcRecord))
return;
var analyzer = new AuthorithyAnalyzer(marcRecord, statistics);
int count = analyzer.process();
count((count > 0), hasClassifications);
count(count, histogram);
frequencyExamples.computeIfAbsent(count, s -> marcRecord.getId(true));
}
@Override
public void beforeIteration() {
}
@Override
public void fileOpened(Path path) {
}
@Override
public void fileProcessed() {
}
@Override
public void afterIteration(int numberOfprocessedRecords) {
printAuthoritiesByCategories();
printAuthoritiesBySchema();
printAuthoritiesByRecords();
printAuthoritiesHistogram();
printFrequencyExamples();
printAuthoritiesSubfieldsStatistics();
}
private void printAuthoritiesByCategories() {
var path = Paths.get(parameters.getOutputDir(), "authorities-by-categories.csv");
try (var writer = Files.newBufferedWriter(path)) {
writer.write(createRow("category", "recordcount", "instancecount"));
statistics.getRecordsPerCategories()
.entrySet()
.stream()
.forEach(
entry -> {
AuthorityCategory category = entry.getKey();
int recordCount = entry.getValue();
int instanceCount = statistics.getInstancesPerCategories().get(category);
try {
writer.write(createRow(
quote(category.getLabel()),
recordCount,
instanceCount
));
} catch (IOException | NullPointerException ex) {
logger.log(Level.SEVERE, "build", ex);
logger.severe(category.toString());
}
}
);
} catch (IOException e) {
logger.log(Level.SEVERE, "printAuthoritiesByCategories", e);
}
}
private void printAuthoritiesBySchema() {
var path = Paths.get(parameters.getOutputDir(), "authorities-by-schema.csv");
try (var writer = Files.newBufferedWriter(path)) {
writer.write(createRow("id", "field", "location", "scheme", "abbreviation", "abbreviation4solr", "recordcount", "instancecount"));
statistics.getInstances()
.entrySet()
.stream()
.sorted((e1, e2) -> {
int i = e1.getKey().getField().compareTo(e2.getKey().getField());
if (i != 0)
return i;
else {
i = e1.getKey().getLocation().compareTo(e2.getKey().getLocation());
if (i != 0)
return i;
else
return e2.getValue().compareTo(e1.getValue());
}
}
)
.forEach(
entry -> printSingleClassificationBySchema(writer, entry)
);
} catch (IOException e) {
logger.log(Level.SEVERE, "printAuthoritiesBySchema", e);
}
}
private void printSingleClassificationBySchema(BufferedWriter writer, Map.Entry entry) {
Schema schema = entry.getKey();
int instanceCount = entry.getValue();
int recordCount = statistics.getRecords().get(schema);
try {
writer.write(createRow(
schema.getId(),
schema.getField(),
schema.getLocation(),
'"' + schema.getSchema().replace("\"", "\\\"") + '"',
schema.getAbbreviation(),
Utils.solarize(schema.getAbbreviation()),
recordCount,
instanceCount
));
} catch (IOException | NullPointerException e) {
logger.log(Level.SEVERE, "printSingleClassificationBySchema", e);
System.err.println(schema);
}
}
private void printAuthoritiesByRecords() {
Path path;
path = Paths.get(parameters.getOutputDir(), "authorities-by-records.csv");
try (var writer = Files.newBufferedWriter(path)) {
writer.write(createRow("records-with-authorities", "count"));
hasClassifications
.entrySet()
.stream()
.sorted((e1, e2) ->
e2.getValue().compareTo(e1.getValue()))
.forEach(
e -> {
try {
writer.write(createRow(e.getKey().toString(), e.getValue()));
} catch (IOException ex) {
logger.log(Level.SEVERE, "printAuthoritiesByRecords", ex);
}
}
);
} catch (IOException e) {
logger.log(Level.SEVERE, "printAuthoritiesByRecords", e);
}
}
private void printAuthoritiesHistogram() {
var path = Paths.get(parameters.getOutputDir(), "authorities-histogram.csv");
try (var writer = Files.newBufferedWriter(path)) {
writer.write(createRow("count", "frequency"));
histogram
.entrySet()
.stream()
.sorted((e1, e2) -> e1.getKey().compareTo(e2.getKey()))
.forEach(
entry -> {
try {
writer.write(createRow(entry.getKey(), entry.getValue()));
} catch (IOException e) {
logger.log(Level.SEVERE, "printAuthoritiesHistogram", e);
}
}
);
} catch (IOException e) {
logger.log(Level.SEVERE, "printAuthoritiesHistogram", e);
}
}
private void printFrequencyExamples() {
var path = Paths.get(parameters.getOutputDir(), "authorities-frequency-examples.csv");
try (var writer = Files.newBufferedWriter(path)) {
writer.write(createRow("count", "id"));
frequencyExamples
.entrySet()
.stream()
.sorted((e1, e2) -> e1.getKey().compareTo(e2.getKey()))
.forEach(
entry -> {
try {
writer.write(createRow(entry.getKey(), entry.getValue()));
} catch (IOException e) {
logger.log(Level.SEVERE, "printFrequencyExamples", e);
}
}
);
} catch (IOException e) {
logger.log(Level.SEVERE, "printFrequencyExamples", e);
}
}
private void printAuthoritiesSubfieldsStatistics() {
var path = Paths.get(parameters.getOutputDir(), "authorities-by-schema-subfields.csv");
try (var writer = Files.newBufferedWriter(path)) {
// final List header = Arrays.asList("field", "location", "label", "abbreviation", "subfields", "scount");
final List header = Arrays.asList("id", "subfields", "count");
writer.write(createRow(header));
statistics.getSubfields()
.entrySet()
.stream()
.sorted((e1, e2) ->
e1.getKey().getField().compareTo(e2.getKey().getField()))
.forEach(
schemaEntry -> printSingleSchemaSubfieldsStatistics(writer, schemaEntry)
);
} catch (IOException e) {
logger.log(Level.SEVERE, "printAuthoritiesSubfieldsStatistics", e);
}
}
private void printSingleSchemaSubfieldsStatistics(BufferedWriter writer, Map.Entry, Integer>> schemaEntry) {
Schema schema = schemaEntry.getKey();
Map, Integer> val = schemaEntry.getValue();
val
.entrySet()
.stream()
.sorted((count1, count2) -> count2.getValue().compareTo(count1.getValue()))
.forEach(
countEntry -> {
List subfields = countEntry.getKey();
int count = countEntry.getValue();
try {
writer.write(createRow(
schema.getId(),
// schema.field,
// schema.location,
// '"' + schema.schema.replace("\"", "\\\"") + '"',
// schema.abbreviation,
StringUtils.join(subfields, ';'),
count
));
} catch (IOException ex) {
logger.log(Level.SEVERE, "printSingleSchemaSubfieldsStatistics", ex);
}
}
);
}
private static String createRow(List fields) {
return StringUtils.join(fields, separator) + "\n";
}
private static String createRow(Object... fields) {
return StringUtils.join(fields, separator) + "\n";
}
@Override
public void printHelp(Options options) {
}
@Override
public boolean readyToProcess() {
return readyToProcess;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy