All Downloads are FREE. Search and download functionalities are using the official Maven repository.

uk.bl.wa.analyser.payload.WARCPayloadAnalysers Maven / Gradle / Ivy

There is a newer version: 3.3.0
Show newest version
/**
 * 
 */
package uk.bl.wa.analyser.payload;

/*
 * #%L
 * warc-indexer
 * %%
 * Copyright (C) 2013 - 2018 The webarchive-discovery project contributors
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 2 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public
 * License along with this program.  If not, see
 * .
 * #L%
 */

import java.io.InputStream;

import org.apache.commons.codec.binary.Hex;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.archive.io.ArchiveRecordHeader;
import org.archive.url.UsableURI;
import org.archive.url.UsableURIFactory;

import com.google.common.base.Splitter;
import com.typesafe.config.Config;

import uk.bl.wa.nanite.droid.DroidDetector;
import uk.bl.wa.solr.SolrFields;
import uk.bl.wa.solr.SolrRecord;
import uk.bl.wa.solr.TikaExtractor;
import uk.bl.wa.util.Instrument;
import uk.bl.wa.util.Normalisation;
import uk.gov.nationalarchives.droid.command.action.CommandExecutionException;


/**
 * 
 * TODO Entropy, compressibility, fuzzy hashes, etc.
 * 
 * @author anj
 *
 */
public class WARCPayloadAnalysers {
    private static Log log = LogFactory.getLog( WARCPayloadAnalysers.class );
    
    private boolean passUriToFormatTools = false;
    private TikaExtractor tika = null;
    private DroidDetector dd = null;
    private boolean runDroid = true;
    private boolean droidUseBinarySignaturesOnly = false;

    private boolean extractContentFirstBytes = true;
    private int firstBytesLength = 32;

    public HTMLAnalyser html;
    public PDFAnalyser pdf;
    public XMLAnalyser xml;
    public ImageAnalyser image;

    private boolean extractApachePreflightErrors;
    private boolean extractImageFeatures;

    public WARCPayloadAnalysers(Config conf) {
        this.extractContentFirstBytes = conf.getBoolean( "warc.index.extract.content.first_bytes.enabled" );
        this.firstBytesLength = conf.getInt( "warc.index.extract.content.first_bytes.num_bytes" );
        log.info("first_bytes config: " + this.extractContentFirstBytes + " "
                + this.firstBytesLength);
        this.runDroid = conf.getBoolean( "warc.index.id.droid.enabled" );
        this.passUriToFormatTools = conf.getBoolean( "warc.index.id.useResourceURI" );
        this.droidUseBinarySignaturesOnly = conf.getBoolean( "warc.index.id.droid.useBinarySignaturesOnly" );

        this.extractApachePreflightErrors = conf.getBoolean( "warc.index.extract.content.extractApachePreflightErrors" );
        this.extractImageFeatures = conf.getBoolean("warc.index.extract.content.images.enabled");
        log.info("Image feature extraction = " + this.extractImageFeatures);
        
        // Attempt to set up Droid:
        try {
            dd = new DroidDetector();
            dd.setBinarySignaturesOnly( droidUseBinarySignaturesOnly );
        } catch( CommandExecutionException e ) {
            e.printStackTrace();
            dd = null;
        }
        
        // Set up Tika:
        tika = new TikaExtractor( conf );
        
        // Set up other extractors:
        html = new HTMLAnalyser(conf);
        if (this.extractApachePreflightErrors) {
            pdf = new PDFAnalyser(conf);
        }
        xml = new XMLAnalyser(conf);
        if (this.extractImageFeatures) {
            image = new ImageAnalyser(conf);
        }
        Instrument.createSortedStat("WARCPayloadAnalyzers.analyze#droid", Instrument.SORT.avgtime, 5);
    }
    
    public void analyse(String source, ArchiveRecordHeader header, InputStream tikainput, SolrRecord solr) {
        final String url = Normalisation.sanitiseWARCHeaderValue(header.getUrl());
        log.debug("Analysing " + url);

        final long start = System.nanoTime();
        // Analyse with tika:
        try {
            if( passUriToFormatTools ) {
                solr = tika.extract(source, solr, tikainput, url );
            } else {
                solr = tika.extract(source, solr, tikainput, null );
            }
        } catch( Exception i ) {
            log.error( i + ": " + i.getMessage() + ";tika; " + url + "@" + header.getOffset() );
        }
        Instrument.timeRel("WARCPayloadAnalyzers.analyze#total",
                           "WARCPayloadAnalyzers.analyze#tikasolrextract", start);

        final long firstBytesStart = System.nanoTime();
        // Pull out the first few bytes, to hunt for new format by magic:
        try {
            tikainput.reset();
            byte[] ffb = new byte[ this.firstBytesLength ];
            int read = tikainput.read( ffb );
            if( read >= 4 ) {
                String hexBytes = Hex.encodeHexString( ffb );
                solr.addField( SolrFields.CONTENT_FFB, hexBytes.substring( 0, 2 * 4 ) );
                StringBuilder separatedHexBytes = new StringBuilder();
                for( String hexByte : Splitter.fixedLength( 2 ).split( hexBytes ) ) {
                    separatedHexBytes.append( hexByte );
                    separatedHexBytes.append( " " );
                }
                if( this.extractContentFirstBytes ) {
                    solr.addField( SolrFields.CONTENT_FIRST_BYTES, separatedHexBytes.toString().trim() );
                }
            }
        } catch( Exception i ) {
            log.error( i + ": " + i.getMessage() + ";ffb; " + url + "@" + header.getOffset() );
        }
        Instrument.timeRel("WARCPayloadAnalyzers.analyze#total",
                           "WARCPayloadAnalyzers.analyze#firstbytes", firstBytesStart);

        // Also run DROID (restricted range):
        if( dd != null && runDroid == true ) {
            final long droidStart = System.nanoTime();
            try {
                tikainput.reset();
                // Pass the URL in so DROID can fall back on that:
                Metadata metadata = new Metadata();
                if( passUriToFormatTools ) {
                    UsableURI uuri = UsableURIFactory.getInstance(Normalisation.fixURLErrors(url) );
                    // Droid seems unhappy about spaces in filenames, so hack to avoid:
                    String cleanUrl = uuri.getName().replace( " ", "+" );
                    metadata.set( Metadata.RESOURCE_NAME_KEY, cleanUrl );
                }
                // Run Droid:
                MediaType mt = dd.detect( tikainput, metadata );
                solr.addField( SolrFields.CONTENT_TYPE_DROID, mt.toString() );
                Instrument.timeRel("WARCPayloadAnalyzers.analyze#droid",
                                   "WARCPayloadAnalyzers.analyze#droid_type=" + mt.toString(),
                                   droidStart);
            } catch( Exception i ) {
                // Note that DROID complains about some URLs with an IllegalArgumentException.
                log.error(i + ": " + i.getMessage() + ";dd; " + url + " @" + header.getOffset(), i);
            }
            Instrument.timeRel("WARCPayloadAnalyzers.analyze#total",
                               "WARCPayloadAnalyzers.analyze#droid", droidStart);

        }

     
        try {
            tikainput.reset();
            String mime = ( String ) solr.getField( SolrFields.SOLR_CONTENT_TYPE ).getValue();
            if( mime.startsWith( "text" ) || mime.startsWith("application/xhtml+xml") ) {
                html.analyse(header, tikainput, solr);

            } else if( mime.startsWith( "image" ) ) {
                if( this.extractImageFeatures ) {
                    image.analyse(header, tikainput, solr);
                }
                
            } else if( mime.startsWith( "application/pdf" ) ) {
                if( extractApachePreflightErrors ) {
                    pdf.analyse(header, tikainput, solr);
                }
                
            } else if( mime.startsWith("application/xml") || mime.startsWith("text/xml") ) {
                xml.analyse(header, tikainput, solr);
                
            } else {
                log.debug("No specific additional parser for: "+mime);
            }
        } catch( Exception i ) {
            log.error(i + ": " + i.getMessage() + ";x; " + url + "@"
                    + header.getOffset(), i);
        }
        Instrument.timeRel("WARCIndexer.extract#analyzetikainput",
                           "WARCPayloadAnalyzers.analyze#total", start);

    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy