
de.gwdg.metadataqa.marc.cli.Completeness Maven / Gradle / Ivy
package de.gwdg.metadataqa.marc.cli;
import de.gwdg.metadataqa.marc.*;
import de.gwdg.metadataqa.marc.cli.parameters.CommonParameters;
import de.gwdg.metadataqa.marc.cli.parameters.CompletenessParameters;
import de.gwdg.metadataqa.marc.cli.plugin.CompletenessFactory;
import de.gwdg.metadataqa.marc.cli.plugin.CompletenessPlugin;
import de.gwdg.metadataqa.marc.cli.processor.BibliographicInputProcessor;
import de.gwdg.metadataqa.marc.cli.utils.RecordIterator;
import de.gwdg.metadataqa.marc.cli.utils.ignorablerecords.RecordFilter;
import de.gwdg.metadataqa.marc.cli.utils.ignorablerecords.RecordIgnorator;
import de.gwdg.metadataqa.marc.dao.DataField;
import de.gwdg.metadataqa.marc.dao.MarcControlField;
import de.gwdg.metadataqa.marc.dao.MarcPositionalControlField;
import de.gwdg.metadataqa.marc.dao.record.BibliographicRecord;
import de.gwdg.metadataqa.marc.definition.ControlValue;
import de.gwdg.metadataqa.marc.definition.structure.DataFieldDefinition;
import de.gwdg.metadataqa.marc.definition.tags.TagCategory;
import de.gwdg.metadataqa.marc.model.validation.ValidationErrorFormat;
import de.gwdg.metadataqa.marc.utils.BasicStatistics;
import de.gwdg.metadataqa.marc.utils.TagHierarchy;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.marc4j.marc.Record;
import java.io.IOException;
import java.io.Serializable;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import static de.gwdg.metadataqa.marc.Utils.createRow;
import static de.gwdg.metadataqa.marc.Utils.quote;
public class Completeness implements BibliographicInputProcessor, Serializable {
private static final Logger logger = Logger.getLogger(Completeness.class.getCanonicalName());
private static final Pattern dataFieldPattern = Pattern.compile("^(\\d\\d\\d)\\$(.*)$");
private static final Pattern numericalPattern = Pattern.compile("^(\\d)$");
private final Options options;
private CompletenessParameters parameters;
private Map library003Counter = new TreeMap<>();
private Map libraryCounter = new TreeMap<>();
private Map> packageCounter = new TreeMap<>();
private Map> elementCardinality = new TreeMap<>();
private Map> elementFrequency = new TreeMap<>();
// private Map tagCache = new HashMap<>();
// private Map libraryMap = new HashMap<>();
// private Map fieldMap = new HashMap<>();
private Map> fieldHistogram = new HashMap<>();
private boolean readyToProcess;
private CompletenessPlugin plugin;
private Map packageNameCache = new HashMap<>();
private RecordFilter recordFilter;
private RecordIgnorator recordIgnorator;
public Completeness(String[] args) throws ParseException {
parameters = new CompletenessParameters(args);
options = parameters.getOptions();
readyToProcess = true;
plugin = CompletenessFactory.create(parameters);
recordFilter = parameters.getRecordFilter();
recordIgnorator = parameters.getRecordIgnorator();
}
public static void main(String[] args) {
BibliographicInputProcessor processor = null;
try {
processor = new Completeness(args);
} catch (ParseException e) {
System.err.println("ERROR. " + e.getLocalizedMessage());
System.exit(0);
}
if (processor.getParameters().getArgs().length < 1) {
System.err.println("Please provide a MARC file name!");
processor.printHelp(processor.getParameters().getOptions());
System.exit(0);
}
if (processor.getParameters().doHelp()) {
processor.printHelp(processor.getParameters().getOptions());
System.exit(0);
}
RecordIterator iterator = new RecordIterator(processor);
iterator.start();
}
@Override
public CommonParameters getParameters() {
return parameters;
}
@Override
public void processRecord(Record marc4jRecord, int recordNumber) throws IOException {
// do nothing
}
@Override
public void processRecord(BibliographicRecord marcRecord, int recordNumber) throws IOException {
if (!recordFilter.isAllowable(marcRecord))
return;
if (recordIgnorator.isIgnorable(marcRecord))
return;
Map recordFrequency = new TreeMap<>();
Map recordPackageCounter = new TreeMap<>();
// private Map>
String documentType = plugin.getDocumentType(marcRecord);
elementCardinality.computeIfAbsent(documentType, s -> new TreeMap<>());
elementFrequency.computeIfAbsent(documentType, s -> new TreeMap<>());
if (marcRecord.getControl003() != null)
count(marcRecord.getControl003().getContent(), library003Counter);
for (String library : extract(marcRecord, "852", "a")) {
count(library, libraryCounter);
}
if (!parameters.isPica()) {
processLeader(marcRecord, recordFrequency, recordPackageCounter, documentType);
processSimpleControlfields(marcRecord, recordFrequency, recordPackageCounter, documentType);
processPositionalControlFields(marcRecord, recordFrequency, recordPackageCounter, documentType);
}
processDataFields(marcRecord, recordFrequency, recordPackageCounter, documentType);
for (String key : recordFrequency.keySet()) {
count(key, elementFrequency.get(documentType));
count(key, elementFrequency.get("all"));
fieldHistogram.computeIfAbsent(key, s -> new TreeMap<>());
count(recordFrequency.get(key), fieldHistogram.get(key));
}
for (String key : recordPackageCounter.keySet()) {
packageCounter.computeIfAbsent(documentType, s -> new TreeMap<>());
count(key, packageCounter.get(documentType));
count(key, packageCounter.get("all"));
}
}
private void processLeader(BibliographicRecord marcRecord, Map recordFrequency, Map recordPackageCounter, String documentType) {
if (marcRecord.getLeader() != null) {
for (ControlValue position : marcRecord.getLeader().getValuesList()) {
String marcPath = position.getDefinition().getId();
count(marcPath, elementCardinality.get(documentType));
count(marcPath, elementCardinality.get("all"));
count(marcPath, recordFrequency);
count(TagCategory.TAGS_00X.getPackageName(), recordPackageCounter);
}
}
}
private void processSimpleControlfields(BibliographicRecord marcRecord, Map recordFrequency, Map recordPackageCounter, String documentType) {
for (MarcControlField field : marcRecord.getSimpleControlfields()) {
if (field != null) {
String marcPath = field.getDefinition().getTag();
count(marcPath, elementCardinality.get(documentType));
count(marcPath, elementCardinality.get("all"));
count(marcPath, recordFrequency);
count(TagCategory.TAGS_00X.getPackageName(), recordPackageCounter);
}
}
}
private void processPositionalControlFields(BibliographicRecord marcRecord,
Map recordFrequency,
Map recordPackageCounter,
String documentType) {
for (MarcPositionalControlField field : marcRecord.getPositionalControlfields()) {
if (field != null) {
for (ControlValue position : field.getValuesList()) {
String marcPath = position.getDefinition().getId();
count(marcPath, elementCardinality.get(documentType));
count(marcPath, elementCardinality.get("all"));
count(marcPath, recordFrequency);
count(TagCategory.TAGS_00X.getPackageName(), recordPackageCounter);
}
}
}
}
private void processDataFields(BibliographicRecord marcRecord,
Map recordFrequency,
Map recordPackageCounter,
String documentType) {
for (DataField field : marcRecord.getDatafields()) {
if (parameters.getIgnorableFields().contains(field.getTag()))
continue;
count(getPackageName(field), recordPackageCounter);
count(field.getTag(), elementCardinality.get(documentType));
count(field.getTag(), elementCardinality.get("all"));
count(field.getTag(), recordFrequency);
List marcPaths = getMarcPaths(field);
for (String marcPath : marcPaths) {
count(marcPath, elementCardinality.get(documentType));
count(marcPath, elementCardinality.get("all"));
count(marcPath, recordFrequency);
}
}
}
private List getMarcPaths(DataField field) {
List marcPaths = new ArrayList<>();
if (parameters.isMarc21()) {
if (field.getInd1() != null)
if (field.getDefinition() != null && field.getDefinition().getInd1().exists() || !field.getInd1().equals(" "))
marcPaths.add(String.format("%s$!ind1", field.getTag()));
if (field.getInd2() != null)
if (field.getDefinition() != null && field.getDefinition().getInd2().exists() || !field.getInd2().equals(" "))
marcPaths.add(String.format("%s$!ind2", field.getTag()));
}
for (MarcSubfield subfield : field.getSubfields())
if (numericalPattern.matcher(subfield.getCode()).matches())
marcPaths.add(String.format("%s$|%s", field.getTag(), subfield.getCode()));
else
marcPaths.add(String.format("%s$%s", field.getTag(), subfield.getCode()));
return marcPaths;
}
private String getPackageName(DataField field) {
String packageName;
if (field.getDefinition() != null) {
if (packageNameCache.containsKey(field.getDefinition()))
packageName = packageNameCache.get(field.getDefinition());
else {
packageName = plugin.getPackageName(field);
if (StringUtils.isBlank(packageName)) {
logger.warning(String.format("%s has no package. /%s", field, field.getDefinition().getClass()));
packageName = TagCategory.OTHER.getPackageName();
}
packageNameCache.put(field.getDefinition(), packageName);
}
} else {
packageName = TagCategory.OTHER.getPackageName();
}
return packageName;
}
private List extract(BibliographicRecord marcRecord, String tag, String subfield) {
List values = new ArrayList<>();
List fields = marcRecord.getDatafield(tag);
if (fields != null && !fields.isEmpty()) {
for (DataField field : fields) {
List subfieldInstances = field.getSubfield(subfield);
if (subfieldInstances != null) {
for (MarcSubfield subfieldInstance : subfieldInstances) {
values.add(subfieldInstance.getValue());
}
}
}
}
return values;
}
private void count(T key, Map counter) {
counter.computeIfAbsent(key, s -> 0);
counter.put(key, counter.get(key) + 1);
}
@Override
public void beforeIteration() {
logger.info(parameters.formatParameters());
elementCardinality.put("all", new TreeMap<>());
elementFrequency.put("all", new TreeMap<>());
packageCounter.put("all", new TreeMap<>());
}
@Override
public void fileOpened(Path file) {
// do nothing
}
@Override
public void fileProcessed() {
// do nothing
}
@Override
public void afterIteration(int numberOfprocessedRecords) {
String fileExtension = ".csv";
final char separator = getSeparator(parameters.getFormat());
if (parameters.getFormat().equals(ValidationErrorFormat.TAB_SEPARATED)) {
fileExtension = ".tsv";
}
saveLibraries003(fileExtension, separator);
saveLibraries(fileExtension, separator);
savePackages(fileExtension, separator);
saveMarcElements(fileExtension, separator);
}
private void saveLibraries003(String fileExtension, char separator) {
logger.info("Saving Libraries003 file");
var path = Paths.get(parameters.getOutputDir(), "libraries003" + fileExtension);
try (var writer = Files.newBufferedWriter(path)) {
writer.write("library" + separator + "count\n");
library003Counter
.entrySet()
.stream()
.forEach(entry -> {
try {
writer.write(String.format("\"%s\"%s%d%n", entry.getKey(), separator, entry.getValue()));
} catch (IOException e) {
logger.log(Level.SEVERE, "saveLibraries003", e);
}
});
} catch (IOException e) {
logger.log(Level.SEVERE, "saveLibraries003", e);
}
}
private void saveMarcElements(String fileExtension, char separator) {
System.err.println("MARC elements");
Path path = Paths.get(parameters.getOutputDir(), "marc-elements" + fileExtension);
try (var writer = Files.newBufferedWriter(path)) {
writer.write(createRow(
"documenttype", "path", "packageid", "package", "tag", "subfield",
"number-of-record", "number-of-instances",
"min", "max", "mean", "stddev", "histogram"
));
elementCardinality
.keySet()
.stream()
.forEach(documentType ->
elementCardinality
.get(documentType)
.entrySet()
.stream()
.forEach(entry -> {
try {
String marcPath = entry.getKey();
int cardinality = entry.getValue();
writer.write(formatCardinality(separator, marcPath, cardinality, documentType));
} catch (IOException e) {
logger.log(Level.SEVERE, "saveMarcElements", e);
}
})
);
} catch (IOException e) {
logger.log(Level.SEVERE, "saveMarcElements", e);
}
}
private void savePackages(String fileExtension, char separator) {
logger.info("saving Packages...");
var path = Paths.get(parameters.getOutputDir(), "packages" + fileExtension);
try (var writer = Files.newBufferedWriter(path)) {
writer.write(createRow(separator, "documenttype", "packageid", "name", "label", "iscoretag", "count"));
packageCounter
.forEach((documentType, packages) ->
packages.forEach((packageName, count) -> {
try {
TagCategory tagCategory = TagCategory.getPackage(packageName);
String range = packageName;
String label = "";
int id = 100;
boolean isPartOfMarcScore = false;
if (tagCategory != null) {
id = tagCategory.getId();
range = tagCategory.getRange();
label = tagCategory.getLabel();
isPartOfMarcScore = tagCategory.isPartOfMarcCore();
} else {
logger.severe(packageName + " has not been found in TagCategory");
}
writer.write(createRow(
separator, quote(documentType), id, quote(range), quote(label), isPartOfMarcScore, count
));
} catch (IOException e) {
logger.log(Level.SEVERE, "savePackages", e);
}
})
);
} catch (IOException e) {
logger.log(Level.SEVERE, "savePackages", e);
}
}
private void saveLibraries(String fileExtension, char separator) {
logger.info("Saving Libraries");
var path = Paths.get(parameters.getOutputDir(), "libraries" + fileExtension);
try (var writer = Files.newBufferedWriter(path)) {
writer.write("library" + separator + "count\n");
libraryCounter
.entrySet()
.stream()
.forEach(entry -> {
try {
writer.write(String.format("\"%s\"%s%d%n", entry.getKey(), separator, entry.getValue()));
} catch (IOException e) {
logger.log(Level.SEVERE, "saveLibraries", e);
}
});
} catch (IOException e) {
logger.log(Level.SEVERE, "saveLibraries", e);
}
}
private String formatCardinality(char separator,
String marcPath,
int cardinality,
String documentType) {
if (marcPath.equals("")) {
logger.severe("Empty key from " + marcPath);
}
String marcPathLabel = marcPath.replace("!ind", "ind").replaceAll("\\|(\\d)$", "$1");
int packageId = TagCategory.OTHER.getId();
String packageLabel = TagCategory.OTHER.getLabel();
String tagLabel = "";
String subfieldLabel = "";
TagHierarchy tagHierarchy = plugin.getTagHierarchy(marcPathLabel);
if (tagHierarchy != null) {
packageId = tagHierarchy.getPackageId();
packageLabel = tagHierarchy.getPackageLabel();
tagLabel = tagHierarchy.getTagLabel();
subfieldLabel = tagHierarchy.getSubfieldLabel();
} else {
logger.severe("Key can not be found in the TagHierarchy: " + marcPathLabel);
}
// Integer cardinality = entry.getValue();
Integer frequency = elementFrequency.get(documentType).get(marcPath);
BasicStatistics statistics = new BasicStatistics(fieldHistogram.get(marcPath));
if (!fieldHistogram.containsKey(marcPath)) {
logger.warning(String.format("Field %s is not registered in histogram", marcPath));
}
List
© 2015 - 2025 Weber Informatics LLC | Privacy Policy