
com.gc.iotools.fmt.detect.droid.DroidDetectorImpl Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of wazformat Show documentation
Show all versions of wazformat Show documentation
Format identification utilities
The newest version!
package com.gc.iotools.fmt.detect.droid;
/*
* Copyright (c) 2008, 2014 Gabriele Contini. This source code is released
* under the BSD License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
import uk.gov.nationalarchives.droid.base.FileFormatHit;
import uk.gov.nationalarchives.droid.binFileReader.ByteReader;
import uk.gov.nationalarchives.droid.binFileReader.IdentificationFile;
import uk.gov.nationalarchives.droid.binFileReader.RandomAccessByteReader;
import uk.gov.nationalarchives.droid.signatureFile.FFSignatureFile;
import uk.gov.nationalarchives.droid.signatureFile.FileFormat;
import uk.gov.nationalarchives.droid.signatureFile.InternalSignature;
import uk.gov.nationalarchives.droid.xmlReader.SAXModelBuilder;
import com.gc.iotools.fmt.base.DetectionLibrary;
import com.gc.iotools.fmt.base.FormatEnum;
import com.gc.iotools.fmt.base.FormatId;
import com.gc.iotools.fmt.base.ResettableInputStream;
/**
* Implementation of a FileDetector that relies on droid classes.
*
* @author dvd.smnt
*/
public class DroidDetectorImpl implements DetectionLibrary {
/**
* Namespace for the xml file format signatures file.
*/
private static final String SIGNATURE_FILE_NS = "http://www.nationalarchives.gov.uk/pronom/SignatureFile";
private static final String SIGNATURE_FILE = "DROID_SignatureFile_V18.xml";
private static final String MAPPING_FILE = "mapping.properties";
private final Class extends FormatEnum> formatEnumClass;
private static final Map CONF_MAP = new HashMap();
private final String configFile;
private final Properties mapping;
private static final Logger LOG = LoggerFactory
.getLogger(DroidDetectorImpl.class);
public DroidDetectorImpl() {
this(FormatEnum.class, SIGNATURE_FILE, MAPPING_FILE);
}
public DroidDetectorImpl(
final Class extends FormatEnum> formatEnumClass,
final String signatureFile, final String mappingFileStr) {
final Class extends FormatEnum> clazz = (formatEnumClass == null ? FormatEnum.class
: formatEnumClass);
if (!(FormatEnum.class.isAssignableFrom(clazz))) {
throw new IllegalArgumentException(" [" + formatEnumClass
+ "] should be an subclass of [" + FormatEnum.class + "]");
}
this.formatEnumClass = clazz;
this.configFile = StringUtils.isBlank(signatureFile) ? SIGNATURE_FILE
: signatureFile;
if (!CONF_MAP.containsKey(this.configFile)) {
final URL confFile = DroidDetectorImpl.class
.getResource(this.configFile);
try {
if ((confFile == null) || (confFile.openConnection() == null)) {
throw new IllegalArgumentException("Configuration file ["
+ signatureFile + "] not found or not readable.");
}
} catch (final IOException e) {
throw new IllegalArgumentException(
"Problem reading configuration file ["
+ signatureFile + "] url[" + confFile + "]",
e);
}
final FFSignatureFile fsigfile = parseSigFile(confFile);
CONF_MAP.put(this.configFile, fsigfile);
}
this.mapping = new Properties();
try {
final InputStream mappingIs = DroidDetectorImpl.class
.getResourceAsStream(mappingFileStr);
this.mapping.load(mappingIs);
mappingIs.close();
} catch (final IOException e) {
throw new IllegalArgumentException("can't load resource["
+ mappingFileStr + "]", e);
}
}
/**
* {@inheritDoc}
*/
public FormatId detect(final FormatEnum[] enabledFormats,
final ResettableInputStream stream) throws IOException {
final IdentificationFile idFile = new IdentificationFile("-");
// File file = File.createTempFile("io-tools-doc", ".doc");
// FileOutputStream fos = new FileOutputStream(file);
// IOUtils.copy(stream, fos);
// fos.close();
// System.out.println("Data written to [" + file.getName() + "]");
stream.resetToBeginning();
final ByteReader testFile = new RandomAccessByteReader(idFile, stream);
// FileByteReader testFile = new FileByteReader(new IdentificationFile(
// file.getAbsolutePath()), true, file.getAbsolutePath());
final FFSignatureFile fsigfile = CONF_MAP.get(this.configFile);
FFSignatureFile reduced = reduceDetectedSequences(fsigfile,
enabledFormats);
reduced.runFileIdentification(testFile);
final int n = testFile.getNumHits();
FormatId fenumId = new FormatId(FormatEnum.UNKNOWN, null);
final Collection enabledFormatCollection = Arrays
.asList(enabledFormats);
for (int i = 0; (i < n)
&& (FormatEnum.UNKNOWN.equals(fenumId.format)); i++) {
final FileFormatHit ffhit = testFile.getHit(i);
final uk.gov.nationalarchives.droid.signatureFile.FileFormat fileFormat = ffhit
.getFileFormat();
final FormatId tmpFid = getFormatEnum(fileFormat);
if (FormatEnum.UNLISTED.equals(tmpFid.format)) {
LOG.warn("Format number[" + fileFormat.getID()
+ "] not found in configured mapping. format ["
+ fileFormat.getName() + "] was returned as ["
+ FormatEnum.UNLISTED + "] version["
+ fileFormat.getName() + "]");
}
if (enabledFormatCollection.contains(tmpFid.format)) {
fenumId = tmpFid;
}
}
return fenumId;
}
private FFSignatureFile reduceDetectedSequences(FFSignatureFile fsig,
FormatEnum[] enabled) {
Collection fformats = new ArrayList();
Collection intSigs = new ArrayList();
Collection enabledColl = new ArrayList(
Arrays.asList(enabled));
Map internalSignatureMap = new HashMap();
for (int i = 0; i < fsig.getNumInternalSignatures(); i++) {
InternalSignature intSignature = fsig.getInternalSignature(i);
internalSignatureMap.put(intSignature.getID(), intSignature);
}
int n = fsig.getNumFileFormats();
for (int i = 0; i < n; i++) {
FileFormat ff = fsig.getFileFormat(i);
final FormatId formatEnum = getFormatEnum(ff);
if (ff.getNumInternalSignatures() > 0
&& enabledColl.contains(formatEnum.format)) {
fformats.add(ff);
for (int j = 0; j < ff.getNumInternalSignatures(); j++) {
int intSigId = ff.getInternalSignatureID(j);
InternalSignature intSig = internalSignatureMap
.get(intSigId);
if (intSig != null) {
intSigs.add(intSig);
} else {
LOG.warn("Internal signature id[" + intSig
+ "] not found.");
}
}
}
}
return new FFSignatureFile(fformats, intSigs);
}
/**
* {@inheritDoc}
*/
public FormatEnum[] getDetectedFormats() {
final FFSignatureFile fsigfile = CONF_MAP.get(this.configFile);
final Collection result = new ArrayList();
for (int i = 0; i < fsigfile.getNumFileFormats(); i++) {
final FileFormat fformat = fsigfile.getFileFormat(i);
final FormatId fid = getFormatEnum(fformat);
final FormatEnum fenum = fid.format;
if (!FormatEnum.UNLISTED.equals(fenum)
&& !FormatEnum.UNKNOWN.equals(fenum)) {
result.add(fenum);
}
}
return result.toArray(new FormatEnum[result.size()]);
}
private FormatId getFormatEnum(final FileFormat id) {
FormatEnum fenum = null;
for (final Object key : this.mapping.keySet()) {
final String value = this.mapping.getProperty((String) key);
if (value.contains(Integer.toString(id.getID()))) {
final String kname = (String) key;
fenum = FormatEnum.getEnum(this.formatEnumClass, kname);
break;
}
}
FormatId result;
if (fenum == null) {
result = new FormatId(FormatEnum.UNLISTED, id.getName());
} else {
result = new FormatId(fenum, id.getVersion());
}
return result;
}
private XMLReader getXMLReader(final SAXModelBuilder mb) throws Exception {
final SAXParserFactory factory = SAXParserFactory.newInstance();
factory.setNamespaceAware(true);
// factory.setValidating(true);
final SAXParser saxParser = factory.newSAXParser();
final XMLReader parser = saxParser.getXMLReader();
// URL url = DroidDetectorImpl.class
// .getResource("DROID_SignatureFile.xsd");
// parser.setProperty(
// "http://java.sun.com/xml/jaxp/properties/schemaSource", url);
mb.setupNamespace(SIGNATURE_FILE_NS, true);
parser.setContentHandler(mb);
return parser;
}
// private void addFiles() {
// // Process each file in array of filenames
// for (int fileNum = 0; fileNum < this.myFiles.length
// && !this.myAnalysisController.isAnalysisCancelled(); fileNum++) {
// this.logger.recordPath(this.myFiles[fileNum]);
// if (UrlByteReader.isURL(this.myFiles[fileNum])) {
// // File is a URL
// processFile(new IdentificationFile(this.myFiles[fileNum]));
// } else if (InputStreamByteReader
// .isInputStream(this.myFiles[fileNum])) {
// // File is an input stream
// processFile(new IdentificationFile(this.myFiles[fileNum]));
// } else {
// // File is local
// addFile(this.myFiles[fileNum]);
// }
// }
// }
private FFSignatureFile parseSigFile(final URL signatureFileURL) {
final SAXModelBuilder mb = new SAXModelBuilder();
try {
final XMLReader parser = getXMLReader(mb);
final InputStream signatureFileStream = signatureFileURL
.openStream();
parser.parse(new InputSource(signatureFileStream));
signatureFileStream.close();
} catch (final Exception e) {
throw new IllegalStateException(
"Error reading configuration file " + "["
+ signatureFileURL + "]", e);
}
final FFSignatureFile fsgf = (FFSignatureFile) mb.getModel();
fsgf.prepareForUse();
return fsgf;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy