// gov.nasa.pds.harvest.proc.ProductProcessor
package gov.nasa.pds.harvest.proc;
import java.io.File;
import java.util.List;
import javax.xml.parsers.DocumentBuilderFactory;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.w3c.dom.Document;
import gov.nasa.pds.harvest.cfg.HarvestCfg;
import gov.nasa.pds.harvest.job.Job;
import gov.nasa.pds.harvest.meta.Metadata;
import gov.nasa.pds.harvest.meta.ex.AutogenExtractor;
import gov.nasa.pds.harvest.meta.ex.BasicMetadataExtractor;
import gov.nasa.pds.harvest.meta.ex.BundleMetadataExtractor;
import gov.nasa.pds.harvest.meta.ex.FileMetadataExtractor;
import gov.nasa.pds.harvest.meta.ex.InternalReferenceExtractor;
import gov.nasa.pds.harvest.meta.ex.SearchMetadataExtractor;
import gov.nasa.pds.harvest.util.out.RegistryDocWriter;
import gov.nasa.pds.harvest.util.xml.XmlDomUtils;
import gov.nasa.pds.harvest.util.xml.XmlNamespaces;
/**
* Process products (PDS4 XML label files)
* @author karpenko
*/
/**
 * Process products (PDS4 XML label files): parse each label into a DOM tree,
 * run the metadata extractors over it, and hand the result to the registry writer.
 * @author karpenko
 */
public class ProductProcessor
{
    private Logger log;

    // Skip files bigger than 10MB
    private static final long MAX_XML_FILE_LENGTH = 10_000_000;

    private DocumentBuilderFactory dbf;

    // Bundle and Collection extractors & processors
    private BundleMetadataExtractor bundleExtractor;

    // Common extractors
    private BasicMetadataExtractor basicExtractor;
    private InternalReferenceExtractor refExtractor;
    private AutogenExtractor autogenExtractor;
    private SearchMetadataExtractor searchExtractor;
    private FileMetadataExtractor fileDataExtractor;

    private RegistryDocWriter writer;

    /**
     * Constructor
     * @param cfg Harvest configuration parameters (must not be null)
     * @param writer writer that receives the extracted metadata (must not be null)
     * @throws Exception if the XML parser cannot be configured
     *         or an extractor fails to initialize
     */
    public ProductProcessor(HarvestCfg cfg, RegistryDocWriter writer) throws Exception
    {
        if(cfg == null) throw new IllegalArgumentException("Configuration is null");
        if(writer == null) throw new IllegalArgumentException("Writer is null");
        this.writer = writer;

        log = LogManager.getLogger(getClass());

        dbf = DocumentBuilderFactory.newInstance();
        dbf.setNamespaceAware(false);
        // Harden the parser against XXE: label files may come from untrusted
        // sources, so never resolve external entities or expand entity
        // references while building the DOM.
        dbf.setFeature("http://xml.org/sax/features/external-general-entities", false);
        dbf.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        dbf.setXIncludeAware(false);
        dbf.setExpandEntityReferences(false);

        basicExtractor = new BasicMetadataExtractor();
        refExtractor = new InternalReferenceExtractor();
        autogenExtractor = new AutogenExtractor();
        searchExtractor = new SearchMetadataExtractor();
        fileDataExtractor = new FileMetadataExtractor(cfg);
        bundleExtractor = new BundleMetadataExtractor();
    }

    /**
     * Process one file
     * @param file PDS label XML file
     * @param job Harvest job configuration parameters
     * @throws Exception Generic exception
     */
    public void processFile(File file, Job job) throws Exception
    {
        // Skip very large files; anything over MAX_XML_FILE_LENGTH is
        // assumed not to be a parseable label.
        if(file.length() > MAX_XML_FILE_LENGTH)
        {
            log.warn("File is too big to parse: " + file.getAbsolutePath());
            return;
        }

        Document doc = XmlDomUtils.readXml(dbf, file);
        processMetadata(file, doc, job);
    }

    /**
     * Extract metadata from a label file and write it out.
     * @param file PDS label file
     * @param doc Parsed XML DOM model of the PDS label file
     * @param job Harvest job configuration parameters
     * @throws Exception Generic exception
     */
    private void processMetadata(File file, Document doc, Job job) throws Exception
    {
        // Extract basic metadata
        Metadata meta = basicExtractor.extract(file, doc, job);
        log.info("Processing " + file.getAbsolutePath());

        String rootElement = doc.getDocumentElement().getNodeName();

        // Process Bundle specific data (bundle member entries become
        // internal references to the member collections)
        if("Product_Bundle".equals(rootElement))
        {
            addCollectionRefs(meta, doc);
        }

        // Internal references
        refExtractor.addRefs(meta.intRefs, doc);

        // Extract fields autogenerated from data dictionary
        XmlNamespaces nsInfo = autogenExtractor.extract(file, meta.fields, job);

        // Extract search fields
        searchExtractor.extract(doc, meta.fields);

        // Extract file data
        fileDataExtractor.extract(file, meta, job);

        // Write metadata
        writer.write(meta, nsInfo, job.jobId);
    }

    /**
     * Add internal references for every bundle member entry found in a
     * Product_Bundle label.
     * @param meta metadata record receiving the references
     * @param doc parsed DOM of the bundle label
     * @throws Exception Generic exception
     */
    private void addCollectionRefs(Metadata meta, Document doc) throws Exception
    {
        // Parameterized type instead of the original raw List
        List<BundleMetadataExtractor.BundleMemberEntry> bmes =
                bundleExtractor.extractBundleMemberEntries(doc);

        for(BundleMetadataExtractor.BundleMemberEntry bme: bmes)
        {
            bundleExtractor.addRefs(meta.intRefs, bme);
        }
    }
}