
package de.gwdg.metadataqa.marc.cli;

import de.gwdg.metadataqa.marc.*;
import de.gwdg.metadataqa.marc.cli.parameters.CompletenessParameters;
import de.gwdg.metadataqa.marc.cli.processor.BibliographicInputProcessor;
import de.gwdg.metadataqa.marc.cli.utils.RecordIterator;
import de.gwdg.metadataqa.marc.dao.DataField;
import de.gwdg.metadataqa.marc.dao.MarcControlField;
import de.gwdg.metadataqa.marc.dao.MarcPositionalControlField;
import de.gwdg.metadataqa.marc.dao.record.BibliographicRecord;
import de.gwdg.metadataqa.marc.definition.ControlValue;
import de.gwdg.metadataqa.marc.definition.bibliographic.SchemaType;
import de.gwdg.metadataqa.marc.definition.structure.DataFieldDefinition;
import de.gwdg.metadataqa.marc.definition.FRBRFunction;
import de.gwdg.metadataqa.marc.definition.structure.Indicator;
import de.gwdg.metadataqa.marc.model.validation.ValidationErrorFormat;
import de.gwdg.metadataqa.marc.utils.Counter;
import de.gwdg.metadataqa.marc.utils.FrbrFunctionLister;
import de.gwdg.metadataqa.marc.utils.FunctionValue;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.marc4j.marc.Record;
import java.io.IOException;
import java.io.Serializable;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
import static de.gwdg.metadataqa.marc.Utils.createRow;
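
/**
 * Functional analysis of bibliographic records: for each record it counts how many of
 * the record's elements (leader and control field positions, indicators, subfields) are
 * mapped to each FRBR function, aggregates these counts over the whole input, and writes
 * the percentages, a histogram, and the function-to-element mapping to CSV/TSV files.
 */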
public class FunctionalAnalysis implements BibliographicInputProcessor, Serializable {

  private static final Logger logger = Logger.getLogger(FunctionalAnalysis.class.getCanonicalName());

  private final Options options;
  private final boolean readyToProcess;
  private final CompletenessParameters parameters;
  private FrbrFunctionLister frbrFunctionLister;
  private int recordNumber;
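
  // Parses the command-line parameters and builds the baseline FRBR-function mapping
  // for the configured schema (MARC21 or PICA) and MARC version.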
  public FunctionalAnalysis(String[] args) throws ParseException {
    parameters = new CompletenessParameters(args);
    options = parameters.getOptions();
    readyToProcess = true;
    frbrFunctionLister = new FrbrFunctionLister(parameters.getSchemaType(), parameters.getMarcVersion());
    logger.info(frbrFunctionLister.getBaseline().toString());
  }

  public static void main(String[] args) {
    BibliographicInputProcessor processor = null;
    try {
      processor = new FunctionalAnalysis(args);
    } catch (ParseException e) {
      logger.log(Level.SEVERE, "FunctionalAnalysis", e);
      System.exit(0);
    }
    if (processor.getParameters().getArgs().length < 1) {
      logger.severe("Please provide a MARC file name!");
      processor.printHelp(processor.getParameters().getOptions());
      System.exit(0);
    }
    if (processor.getParameters().doHelp()) {
      processor.printHelp(processor.getParameters().getOptions());
      System.exit(0);
    }
    RecordIterator iterator = new RecordIterator(processor);
    iterator.start();
  }

  @Override
  public CompletenessParameters getParameters() {
    return parameters;
  }

  @Override
  public void processRecord(Record marc4jRecord, int recordNumber) throws IOException {
    // nothing to do with the raw marc4j record; the analysis works on the parsed BibliographicRecord
  }
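
  /**
   * Tallies, for a single record, how many of its elements (leader positions, control
   * fields, data field indicators and subfields) support each FRBR function, then feeds
   * the per-record counts into the global totals and the histogram.
   */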
  @Override
  public void processRecord(BibliographicRecord bibliographicRecord, int recordNumber) throws IOException {
    if (parameters.getRecordIgnorator().isIgnorable(bibliographicRecord))
      return;

    this.recordNumber = recordNumber;
    Map<FRBRFunction, FunctionValue> recordCounter = new TreeMap<>();
    for (FRBRFunction f : FRBRFunction.values())
      if (f.getParent() != null)
        recordCounter.put(f, new FunctionValue());

    Map<DataFieldDefinition, Boolean> cache = new HashMap<>();
    if (bibliographicRecord.getSchemaType().equals(SchemaType.MARC21)) {
      countPositionalControlField(recordCounter, bibliographicRecord.getLeader());
      countControlFields(recordCounter, bibliographicRecord.getControlfields());
    }
    countDataFields(recordCounter, bibliographicRecord.getDatafields(), bibliographicRecord.getSchemaType(), cache);

    frbrFunctionLister.calculatePercent(recordCounter);
    frbrFunctionLister.add(recordCounter);
    frbrFunctionLister.addToHistogram(recordCounter);
  }
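
  // Counts FRBR functions supported by data fields. Each field definition is evaluated
  // only once per record (tracked in the cache); indicators are checked for MARC21, and
  // subfields are matched either via their definition (MARC21) or via a "tag$code" PICA path.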
  private void countDataFields(Map<FRBRFunction, FunctionValue> recordCounter,
                               List<DataField> dataFields,
                               SchemaType schemaType,
                               Map<DataFieldDefinition, Boolean> cache) {
    for (DataField dataField : dataFields) {
      DataFieldDefinition definition = dataField.getDefinition();
      if (!cache.containsKey(definition)) {
        cache.put(definition, true);
        if (definition != null && schemaType.equals(SchemaType.MARC21)) {
          countIndicator(recordCounter, definition.getInd1(), dataField.getInd1());
          countIndicator(recordCounter, definition.getInd2(), dataField.getInd2());
        }
        if (schemaType.equals(SchemaType.MARC21)) {
          for (MarcSubfield subfield : dataField.getSubfields())
            if (subfield.getDefinition() != null && subfield.getDefinition().getFrbrFunctions() != null)
              FrbrFunctionLister.countFunctions(subfield.getDefinition().getFrbrFunctions(), recordCounter);
        } else if (schemaType.equals(SchemaType.PICA)) {
          for (MarcSubfield subfield : dataField.getSubfields()) {
            String key = dataField.getTag() + "$" + subfield.getCode();
            if (frbrFunctionLister.getFunctionByPicaPath().containsKey(key))
              FrbrFunctionLister.countFunctions(frbrFunctionLister.getFunctionByPicaPath().get(key), recordCounter);
          }
        }
      }
    }
  }
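
  // An indicator contributes to its FRBR functions only when its definition carries
  // mapped functions and the record actually provides a non-blank indicator value.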
  private void countIndicator(Map<FRBRFunction, FunctionValue> recordCounter,
                              Indicator definition,
                              String value) {
    if (definition.getFrbrFunctions() != null
        && StringUtils.isNotBlank(value)) {
      FrbrFunctionLister.countFunctions(definition.getFrbrFunctions(), recordCounter);
    }
  }
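
  // Control fields are counted either position by position (positional fields such as 008)
  // or as a whole field, based on the functions attached to their definition.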
  private void countControlFields(Map<FRBRFunction, FunctionValue> recordCounter,
                                  List<MarcControlField> controlFields) {
    for (MarcControlField controlField : controlFields) {
      if (controlField == null) {
        continue;
      }
      if (controlField instanceof MarcPositionalControlField) {
        countPositionalControlField(recordCounter, (MarcPositionalControlField) controlField);
      } else {
        FrbrFunctionLister.countFunctions(controlField.getDefinition().getFrbrFunctions(), recordCounter);
      }
    }
  }
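
  // Walks the individual positions of a positional control field (leader, 008 and similar)
  // and counts the FRBR functions attached to each position's definition.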
  private void countPositionalControlField(Map<FRBRFunction, FunctionValue> recordCounter,
                                           MarcPositionalControlField leader) {
    for (ControlValue controlValue : leader.getValuesList()) {
      FrbrFunctionLister.countFunctions(controlValue.getDefinition().getFrbrFunctions(), recordCounter);
    }
  }

  @Override
  public void beforeIteration() {
    // do nothing
  }

  @Override
  public void fileOpened(Path path) {
    // do nothing
  }

  @Override
  public void fileProcessed() {
    // do nothing
  }
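
  // After the last record has been processed: export the aggregated percentages, the
  // per-function histogram, and the function-to-field mapping as CSV or TSV files.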
  @Override
  public void afterIteration(int numberOfprocessedRecords) {
    String fileExtension = ".csv";
    final char separator = getSeparator(parameters.getFormat());
    if (parameters.getFormat().equals(ValidationErrorFormat.TAB_SEPARATED)) {
      fileExtension = ".tsv";
    }

    Map<FRBRFunction, List<Double>> result = frbrFunctionLister.percentOf(recordNumber);
    saveResult(result, fileExtension, separator);

    Map<FRBRFunction, Counter<FunctionValue>> percentHistogram = frbrFunctionLister.getHistogram();
    saveHistogram(percentHistogram, fileExtension, separator);

    saveMapping(fileExtension, separator);
  }
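
  // Writes one row per FRBR function with the number of mapped data elements and the
  // list of element paths (MARC paths for MARC21, condensed PICA paths for PICA).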
  private void saveMapping(String fileExtension,
                           char separator) {
    Map<FRBRFunction, List<String>> functions = null;
    if (parameters.getSchemaType().equals(SchemaType.MARC21))
      functions = frbrFunctionLister.getMarcPathByFunction();
    else if (parameters.getSchemaType().equals(SchemaType.PICA))
      functions = frbrFunctionLister.getPicaPathByFunctionConcensed();

    var path = Paths.get(parameters.getOutputDir(), "functional-analysis-mapping" + fileExtension);
    try (var writer = Files.newBufferedWriter(path)) {
      writer.write("frbrfunction" + separator + "count" + separator + "fields\n");
      for (FRBRFunction function : FRBRFunction.values()) {
        if (function.getParent() != null) {
          List<String> paths = functions.getOrDefault(function, new ArrayList<>());
          List