
// de.gwdg.metadataqa.marc.cli.spark.ParallelValidator (Maven / Gradle / Ivy)
package de.gwdg.metadataqa.marc.cli.spark;
import de.gwdg.metadataqa.marc.MarcFactory;
import de.gwdg.metadataqa.marc.analysis.validator.Validator;
import de.gwdg.metadataqa.marc.analysis.validator.ValidatorConfiguration;
import de.gwdg.metadataqa.marc.dao.record.BibliographicRecord;
import de.gwdg.metadataqa.marc.cli.parameters.ValidatorParameters;
import de.gwdg.metadataqa.marc.cli.ValidatorCli;
import de.gwdg.metadataqa.marc.definition.MarcFormat;
import de.gwdg.metadataqa.marc.model.validation.ValidationErrorFormatter;
import de.gwdg.metadataqa.marc.utils.QAMarcReaderFactory;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.marc4j.MarcReader;
import org.marc4j.marc.Record;
import java.util.logging.Logger;
/**
 * Spark driver that validates MARC records in parallel.
 *
 * <p>The first positional argument is read with {@code JavaSparkContext.textFile}.
 * Every element of the resulting RDD is parsed as one ISO-format MARC record,
 * validated, and the formatted validation errors are saved as text under the
 * details file name taken from the CLI parameters.
 */
public class ParallelValidator {

  private static final Logger logger = Logger.getLogger(
    ParallelValidator.class.getCanonicalName());

  /** Options shown by {@link #help()}; no option is registered in this class. */
  private static final Options options = new Options();

  /**
   * Entry point of the Spark job.
   *
   * @param args command line arguments, handed to {@link ValidatorCli}; the first
   *             positional argument is the input path
   * @throws ParseException declared for command line parsing failures
   */
  public static void main(String[] args) throws ParseException {
    final ValidatorCli validatorCli = new ValidatorCli(args);
    final ValidatorParameters params = validatorCli.getParameters();
    final ValidatorConfiguration validatorConfiguration = validatorCli.getValidityConfiguration();
    validatorCli.setDoPrintInProcessRecord(false);

    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException
    // when no input path was supplied.
    final String[] inputs = params.getArgs();
    if (inputs == null || inputs.length == 0) {
      logger.severe("No input file was given.");
      help();
      return;
    }
    final String inputPath = inputs[0];
    final String outputPath = params.getDetailsFileName();

    logger.info("Input file is " + inputPath);
    logger.info("Output is written to " + outputPath);

    // NOTE(review): the application name looks copied from another job; it is kept
    // unchanged because external monitoring may rely on it -- confirm before renaming.
    SparkConf conf = new SparkConf().setAppName("MarcCompletenessCount");

    // JavaSparkContext is Closeable: try-with-resources stops the context even
    // when the job fails.
    try (JavaSparkContext context = new JavaSparkContext(conf)) {
      System.err.println(params.formatParameters());

      JavaRDD<String> inputFile = context.textFile(inputPath);

      JavaRDD<String> baseCountsRDD = inputFile.flatMap(content -> {
        // Each RDD element is expected to hold exactly one ISO-format record.
        MarcReader reader = QAMarcReaderFactory.getStringReader(MarcFormat.ISO, content);
        Record marc4jRecord = reader.next();
        BibliographicRecord marcRecord = MarcFactory.createFromMarc4j(
          marc4jRecord,
          params.getDefaultRecordType(),
          params.getMarcVersion(),
          params.getReplecementInControlFields());
        validatorCli.processRecord(marcRecord, 1);

        // A fresh Validator per record keeps the collected errors record-local.
        Validator analyzer = new Validator(validatorConfiguration);
        analyzer.validate(marcRecord);
        return ValidationErrorFormatter
          .formatForSummary(analyzer.getValidationErrors(), params.getFormat())
          .iterator();
      });

      baseCountsRDD.saveAsTextFile(outputPath);
    }
  }

  /** Prints the command line usage of this class together with {@link #options}. */
  private static void help() {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp(
      "java -cp [jar] " + ParallelValidator.class.getCanonicalName() + " [options]",
      options);
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy