package uk.bl.wa.indexer;

import static org.archive.format.warc.WARCConstants.HEADER_KEY_ID;
import static org.archive.format.warc.WARCConstants.HEADER_KEY_IP;

/*
 * #%L
 * warc-indexer
 * $Id:$
 * $HeadURL:$
 * %%
 * Copyright (C) 2013 - 2018 The webarchive-discovery project contributors
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 2 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/gpl-2.0.html>.
 * #L%
 */

import static org.archive.format.warc.WARCConstants.HEADER_KEY_TYPE;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.TimeZone;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpParser;
import org.apache.commons.httpclient.ProtocolException;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpHeaders;
import org.apache.log4j.PropertyConfigurator;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.tika.mime.MediaType;
import org.archive.format.warc.WARCConstants;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.arc.ARCRecord;
import org.archive.io.warc.WARCRecord;
import org.archive.url.SURT;
import org.archive.url.UsableURI;
import org.archive.url.UsableURIFactory;
import org.archive.util.ArchiveUtils;
import org.archive.util.SurtPrefixSet;
import org.archive.wayback.accesscontrol.staticmap.StaticMapExclusionFilterFactory;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.resourceindex.filters.ExclusionFilter;
import org.joda.time.DateTimeZone;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;

import com.google.common.collect.ImmutableList;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import com.typesafe.config.ConfigRenderOptions;

import uk.bl.wa.analyser.payload.ARCNameAnalyser;
import uk.bl.wa.analyser.payload.WARCPayloadAnalysers;
import uk.bl.wa.analyser.text.TextAnalysers;
import uk.bl.wa.annotation.Annotations;
import uk.bl.wa.annotation.Annotator;
import uk.bl.wa.extract.LinkExtractor;
import uk.bl.wa.parsers.HtmlFeatureParser;
import uk.bl.wa.solr.SolrFields;
import uk.bl.wa.solr.SolrRecord;
import uk.bl.wa.solr.SolrRecordFactory;
import uk.bl.wa.solr.SolrWebServer;
import uk.bl.wa.util.HashedCachedInputStream;
import uk.bl.wa.util.Instrument;
import uk.bl.wa.util.Normalisation;

/**
 * 
 * Core indexer class that takes a web archive record and generates a Solr record.
 * 
 * TODO Currently a rather crude, monolithic code structure. Should pull the different metadata generation logic out into separate classes or at least methods.
 * 
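 * A minimal usage sketch (illustrative only; exception handling omitted; assumes an
 * ArchiveReader obtained via webarchive-commons' ArchiveReaderFactory, with the
 * resulting SolrRecord submitted to Solr by the caller):
 * <pre>{@code
 * Config conf = ConfigFactory.load();
 * WARCIndexer indexer = new WARCIndexer(conf);
 * try (ArchiveReader reader = ArchiveReaderFactory.get(new File("example.warc.gz"))) {
 *     for (ArchiveRecord record : reader) {
 *         SolrRecord doc = indexer.extract("example.warc.gz", record);
 *         if (doc != null) {
 *             // ... add doc to a Solr update batch ...
 *         }
 *     }
 * }
 * }</pre>
 * 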
 * @author Andrew Jackson 
 * 
 */
public class WARCIndexer {
    private static Log log = LogFactory.getLog( WARCIndexer.class );

    private List<String> url_excludes;
    private List<String> protocol_includes;
    private List<String> response_includes;
    private List<String> record_type_includes;

    private MessageDigest md5 = null;

    /** Text extraction and document-ID options: */
    private boolean extractText;
    private boolean storeText;
    private boolean hashUrlId;

    /** Wayback-style URI filtering: */
    private StaticMapExclusionFilterFactory smef = null;

    /** Hook to the solr server: */
    private boolean checkSolrForDuplicates = false;
    private SolrWebServer solrServer = null;
    
    /** Payload Analysers */
    private long inMemoryThreshold;
    private long onDiskThreshold;
    private WARCPayloadAnalysers wpa;
    
    /** Text Analysers */
    private TextAnalysers txa;

    //Generate fields from regexp on warc-filepath
    ARCNameAnalyser arcname;
    
    /** Annotations */
    private Annotator ant = null;

    // Paired with the HtmlFeatureParser link extractor
    private final boolean addNormalisedURL;

    // Also canonicalise the HOST field (e.g. drop "www.")
    public static final boolean CANONICALISE_HOST = true;

    private final SolrRecordFactory solrFactory;

    /* ------------------------------------------------------------ */

    /**
     * Default constructor, with empty configuration.
     */
    public WARCIndexer() throws NoSuchAlgorithmException {
        this( ConfigFactory.parseString( ConfigFactory.load().root().render( ConfigRenderOptions.concise() ) ) );
    }

    /**
     * Preferred constructor, allows passing in configuration from execution environment.
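     * The configuration keys read below include (values shown are illustrative only,
     * not the project defaults):
     * <pre>
     * warc.index.extract.content.text = true
     * warc.index.extract.content.text_stored = false
     * warc.solr.use_hash_url_id = false
     * warc.solr.check_solr_for_duplicates = false
     * warc.index.extract.url_exclude = []
     * warc.index.extract.protocol_include = [ "http", "https" ]
     * warc.index.extract.response_include = [ "2" ]
     * warc.index.extract.record_type_include = [ "response", "revisit" ]
     * warc.index.exclusions.enabled = false
     * warc.index.extract.inMemoryThreshold = 20MB
     * warc.index.extract.onDiskThreshold = 200MB
     * </pre>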
     */
    public WARCIndexer( Config conf ) throws NoSuchAlgorithmException {
        log.info("Initialising WARCIndexer...");
        try {
            Properties props = new Properties();
            props.load(getClass().getResourceAsStream("/log4j-override.properties"));
            PropertyConfigurator.configure(props);
        } catch (IOException e1) {
            log.error("Failed to load log4j config from properties file.");
        }
        solrFactory = SolrRecordFactory.createFactory(conf);
        // Optional configurations:
        this.extractText = conf.getBoolean( "warc.index.extract.content.text" );
        log.info("Extract text = " + extractText);
        this.storeText = conf
                .getBoolean("warc.index.extract.content.text_stored");
        log.info("Store text = " + storeText);
        this.hashUrlId = conf.getBoolean( "warc.solr.use_hash_url_id" );
        log.info("hashUrlId = " + hashUrlId);
        addNormalisedURL = conf.hasPath(HtmlFeatureParser.CONF_LINKS_NORMALISE) ?
                conf.getBoolean(HtmlFeatureParser.CONF_LINKS_NORMALISE) :
                HtmlFeatureParser.DEFAULT_LINKS_NORMALISE;
        this.checkSolrForDuplicates = conf.getBoolean("warc.solr.check_solr_for_duplicates");
        if( this.hashUrlId == false && this.checkSolrForDuplicates == true ) {
            log.warn("Checking Solr for duplicates may not work as expected when using the timestamp+md5(URL) key.");
            log.warn("You need to use the payload-hash+md5(URL) key option to resolve revisit records.");
        }
        // URLs to exclude:
        this.url_excludes = conf.getStringList( "warc.index.extract.url_exclude" );
        // Protocols to include:
        this.protocol_includes = conf.getStringList( "warc.index.extract.protocol_include" );
        // Response codes to include:
        this.response_includes = conf.getStringList( "warc.index.extract.response_include" );
        // Record types to include:
        this.record_type_includes = conf.getStringList( "warc.index.extract.record_type_include" );

        // URL Filtering options:
        if( conf.getBoolean( "warc.index.exclusions.enabled" ) ) {
            smef = new StaticMapExclusionFilterFactory();
            smef.setFile( conf.getString( "warc.index.exclusions.file" ) );
            smef.setCheckInterval( conf.getInt( "warc.index.exclusions.check_interval" ) );
            try {
                smef.init();
            } catch( IOException e ) {
                log.error( "Failed to load exclusions file." );
                throw new RuntimeException( "StaticMapExclusionFilterFactory failed with IOException when loading " + smef.getFile() );
            }
        }

        // Instantiate required helpers:
        md5 = MessageDigest.getInstance( "MD5" );
        
        // Also hook up to Solr server for queries:
        if( this.checkSolrForDuplicates ) {
            log.info("Initialising connection to Solr...");
            solrServer = new SolrWebServer(conf);
        }
        
        // Set up hash-cache properties:
        this.inMemoryThreshold = conf.getBytes( "warc.index.extract.inMemoryThreshold" );
        this.onDiskThreshold = conf.getBytes( "warc.index.extract.onDiskThreshold" );
        log.info("Hashing & Caching thresholds are: < "+this.inMemoryThreshold+" in memory, < "+this.onDiskThreshold+" on disk.");
        
        // Set up analysers
        log.info("Setting up analysers...");
        this.wpa = new WARCPayloadAnalysers(conf);
        this.txa = new TextAnalysers(conf);


        // Set up annotator
        if (conf.hasPath("warc.index.extract.content.annotations.enabled") && conf.getBoolean("warc.index.extract.content.annotations.enabled")) {
            String annotationsFile = conf.getString("warc.index.extract.content.annotations.file");
            String openAccessSurtsFile = conf.getString("warc.index.extract.content.annotations.surt_prefix_file");
            try {
                Annotations ann = Annotations.fromJsonFile(annotationsFile);
                SurtPrefixSet oaSurts = Annotator.loadSurtPrefix(openAccessSurtsFile);
                this.ant = new Annotator(ann, oaSurts);
            } catch (IOException e) {
                log.error("Failed to load annotations files.");
                throw new RuntimeException("Annotations failed with IOException when loading files " + annotationsFile + ", " + openAccessSurtsFile);
            }
        }

        this.arcname = new ARCNameAnalyser(conf);
    
        // We want stats for the 20 resource types that we spend the most time processing
        Instrument.createSortedStat("WARCIndexer#content_types", Instrument.SORT.time, 20);

        // Log so it's clear this completed ok:
        log.info("Initialisation of WARCIndexer complete.");
    }

    /**
     * Sets the annotations to apply, along with the SURT prefixes defining the
     * open-access subset.
     * 
     * @param ann
     * @param openAccessSurts
     */
    public void setAnnotations(Annotations ann, SurtPrefixSet openAccessSurts) {
        this.ant = new Annotator(ann, openAccessSurts);
    }

    /**
     * @return the checkSolrForDuplicates
     */
    public boolean isCheckSolrForDuplicates() {
        return checkSolrForDuplicates;
    }

    /**
     * @param checkSolrForDuplicates the checkSolrForDuplicates to set
     */
    public void setCheckSolrForDuplicates(boolean checkSolrForDuplicates) {
        this.checkSolrForDuplicates = checkSolrForDuplicates;
    }

    /**
     * This extracts metadata and text from the ArchiveRecord and creates a suitable SolrRecord.
     * 
     * @param archiveName name of the archive (WARC/ARC) file the record came from
     * @param record the archive record to index
     * @return the populated SolrRecord, or null if the record was excluded by the configured filters
     * @throws IOException
     */
    public SolrRecord extract( String archiveName, ArchiveRecord record ) throws IOException {
        return this.extract( archiveName, record, this.extractText );
    }

    /**
     * This extracts metadata from the ArchiveRecord and creates a suitable SolrRecord.
     * Removes the text field if the flag is set.
     * 
     * @param archiveName name of the archive (WARC/ARC) file the record came from
     * @param record the archive record to index
     * @param isTextIncluded whether the extracted text should be retained in the record
     * @return the populated SolrRecord, or null if the record was excluded by the configured filters
     * @throws IOException
     */
    public SolrRecord extract( String archiveName, ArchiveRecord record, boolean isTextIncluded ) throws IOException {
        final long start = System.nanoTime();
        ArchiveRecordHeader header = record.getHeader();
        SolrRecord solr = solrFactory.createRecord(archiveName, header);
        
        if( !header.getHeaderFields().isEmpty() ) {
            if( header.getHeaderFieldKeys().contains( HEADER_KEY_TYPE ) ) {
                log.debug("Looking at "
                        + header.getHeaderValue(HEADER_KEY_TYPE));

                if( !checkRecordType( ( String ) header.getHeaderValue( HEADER_KEY_TYPE ) ) ) {
                    return null;
                }
                // Store WARC record type:
                solr.setField(SolrFields.SOLR_RECORD_TYPE, (String) header.getHeaderValue(HEADER_KEY_TYPE));

                //Store WARC-Record-ID
                solr.setField(SolrFields.WARC_KEY_ID, (String) header.getHeaderValue(HEADER_KEY_ID));
                solr.setField(SolrFields.WARC_IP, (String) header.getHeaderValue(HEADER_KEY_IP));
                
            } else {
                // else we're processing ARCs so nothing to filter and no
                // revisits
                solr.setField(SolrFields.SOLR_RECORD_TYPE, "arc");
            }

            if( header.getUrl() == null )
                return null;

            // Get the URL:
            String targetUrl = Normalisation.sanitiseWARCHeaderValue(header.getUrl());

            // Strip down very long URLs to avoid
            // "org.apache.commons.httpclient.URIException: Created (escaped)
            // uuri > 2083"
            // Trac #2271: replace string-splitting with URI-based methods.
            if (targetUrl.length() > 2000)
                targetUrl = targetUrl.substring(0, 2000);

            log.debug("Current heap usage: "
                    + FileUtils.byteCountToDisplaySize(Runtime.getRuntime()
                            .totalMemory()));
            log.debug("Processing " + targetUrl + " from " + archiveName);

            // Check the filters:
            if( this.checkProtocol( targetUrl ) == false )
                return null;
            if( this.checkUrl( targetUrl ) == false )
                return null;
            if( this.checkExclusionFilter( targetUrl ) == false )
                return null;
                
            // -----------------------------------------------------
            // Add user supplied Archive-It Solr fields and values:
            // -----------------------------------------------------
            solr.setField( SolrFields.INSTITUTION, WARCIndexerCommand.institution );
            solr.setField( SolrFields.COLLECTION, WARCIndexerCommand.collection );
            solr.setField( SolrFields.COLLECTION_ID, WARCIndexerCommand.collection_id );

            // --- Basic headers ---

            // Basic metadata:
            solr.setField(SolrFields.SOURCE_FILE, archiveName);
            solr.setField(SolrFields.SOURCE_FILE_OFFSET,"" + header.getOffset());
            solr.setField(SolrFields.SOURCE_FILE_PATH, header.getReaderIdentifier()); //Full path of file
            
            byte[] url_md5digest = md5
                    .digest(Normalisation.sanitiseWARCHeaderValue(header.getUrl()).getBytes("UTF-8"));
            // String url_base64 =
            // Base64.encodeBase64String(fullUrl.getBytes("UTF-8"));
            String url_md5hex = Base64.encodeBase64String(url_md5digest);
            solr.setField(SolrFields.SOLR_URL, Normalisation.sanitiseWARCHeaderValue(header.getUrl()));
            if (addNormalisedURL) {
                solr.setField( SolrFields.SOLR_URL_NORMALISED, Normalisation.canonicaliseURL(targetUrl) );
            }

            // Get the length, but beware, this value also includes the HTTP headers (i.e. it is the payload_length):
            long content_length = header.getLength();

            // Also pull out the file extension, if any:
            String resourceName = parseResourceName(targetUrl);
            solr.addField(SolrFields.RESOURCE_NAME, resourceName);
            solr.addField(SolrFields.CONTENT_TYPE_EXT,
                    parseExtension(resourceName));

            // Add URL-based fields:
            URI saneURI = parseURL(solr, targetUrl);

            Instrument.timeRel("WARCIndexer.extract#total",
                               "WARCIndexer.extract#archeaders", start);

            InputStream tikainput = null;

            // Only parse HTTP headers for HTTP URIs
            if( targetUrl.startsWith( "http" ) ) {
                // Parse HTTP headers:
                String statusCode = null;
                if( record instanceof WARCRecord ) {
                    // There are not always headers! The code should check first.
                    String statusLine = HttpParser.readLine( record, "UTF-8" );
                    if( statusLine != null && statusLine.startsWith( "HTTP" ) ) {
                        String firstLine[] = statusLine.split( " " );
                        if( firstLine.length > 1 ) {
                            statusCode = firstLine[ 1 ].trim();
                            try {
                                this.processHeaders( solr, statusCode, HttpParser.parseHeaders( record, "UTF-8" ), targetUrl );
                            } catch( ProtocolException p ) {
                                log.error( "ProtocolException [" + statusCode + "]: " + header.getHeaderValue( WARCConstants.HEADER_KEY_FILENAME ) + "@" + header.getHeaderValue( WARCConstants.ABSOLUTE_OFFSET_KEY ), p );
                            }
                        } else {
                            log.warn( "Could not parse status line: " + statusLine );
                        }
                    } else {
                        log.warn( "Invalid status line: " + header.getHeaderValue( WARCConstants.HEADER_KEY_FILENAME ) + "@" + header.getHeaderValue( WARCConstants.ABSOLUTE_OFFSET_KEY ) );
                    }
                    // No need for this, as the headers have already been read from the InputStream (above):
                    // WARCRecordUtils.getPayload(record);
                    tikainput = record;
                } else if( record instanceof ARCRecord ) {
                    ARCRecord arcr = ( ARCRecord ) record;
                    statusCode = "" + arcr.getStatusCode();
                    this.processHeaders( solr, statusCode, arcr.getHttpHeaders() , targetUrl);
                    arcr.skipHttpHeader();
                    tikainput = arcr;
                } else {
                    log.error( "FAIL! Unsupported archive record type." );
                    return solr;
                }

                solr.setField(SolrFields.SOLR_STATUS_CODE, statusCode);

                // Skip recording non-content URLs (i.e. 2xx responses only please):
                if(!checkResponseCode(statusCode)) {
                    log.debug( "Skipping this record based on status code " + statusCode + ": " + targetUrl );
                    return null;
                }
            } else {
                log.info("Skipping header parsing as URL does not start with 'http'");
                // No HTTP headers to skip over, so the whole record body is the payload:
                tikainput = record;
            }
            
            // Update the content_length based on what's available:
            content_length = tikainput.available();
            // Record the length:
            solr.setField(SolrFields.CONTENT_LENGTH, ""+content_length);
            
            // -----------------------------------------------------
            // Headers have been processed, payload ready to cache:
            // -----------------------------------------------------
            
            // Create an appropriately cached version of the payload, to allow analysis.
            final long hashStreamStart = System.nanoTime();
            HashedCachedInputStream hcis = new HashedCachedInputStream(header, tikainput, content_length );
            tikainput = hcis.getInputStream();
            String hash = hcis.getHash();
            Instrument.timeRel("WARCIndexer.extract#total",
                               "WARCIndexer.extract#hashstreamwrap", hashStreamStart);

            // Prepare crawl date information:
            String waybackDate = ( header.getDate().replaceAll( "[^0-9]", "" ) );
            Date crawlDate =  getWaybackDate( waybackDate );
            String crawlDateString = parseCrawlDate(waybackDate);
            
            // Optionally use a hash-based ID to store only one version of a URL:
            String id = null;
            if( hashUrlId ) {
                id = hash + "/" + url_md5hex;
            } else {
                id = waybackDate + "/" + url_md5hex;
            }
            // Set these last:
            solr.setField( SolrFields.ID, id );
            solr.setField( SolrFields.HASH, hash );

            // -----------------------------------------------------
            // Payload has been cached, ready to check crawl dates:
            // -----------------------------------------------------
            
            HashSet<Date> currentCrawlDates = new HashSet<Date>();
            // If we are collapsing records based on hash:
            if (hashUrlId) {
                // Query for currently known crawl dates:
                if (this.checkSolrForDuplicates && solrServer != null) {
                    SolrQuery q = new SolrQuery("id:\"" + id + "\"");
                    q.addField(SolrFields.CRAWL_DATES);
                    try {
                        QueryResponse results = solrServer.query(q);
                        if (results.getResults().size() > 0) {
                            SolrDocument fr = results.getResults().get(0);
                            if (fr.containsKey(SolrFields.CRAWL_DATES)) {
                                for (Object cds : fr
                                        .getFieldValues(SolrFields.CRAWL_DATES)) {
                                    currentCrawlDates.add((Date) cds);
                                }
                            }
                        } else {
                            log.debug("No matching entries found.");
                        }
                    } catch (SolrServerException e) {
                        e.printStackTrace();
                        // FIXME retry?
                    }
                }

                // Is the current date unknown? (inc. no-solr-check case):
                if (!currentCrawlDates.contains(crawlDate)) {
                    // Dates to be merged under the CRAWL_DATES field:
                    solr.mergeField(SolrFields.CRAWL_DATES, crawlDateString);
                    solr.mergeField(SolrFields.CRAWL_YEARS,
                            extractYear(header.getDate()));
                } else {
                    // Otherwise, ensure that all the known dates (i.e. including
                    // this one) are copied over:
                    for (Date ccd : currentCrawlDates) {
                        solr.addField(SolrFields.CRAWL_DATES,
                                formatter.format(ccd));
                        solr.addField(SolrFields.CRAWL_YEARS,
                                getYearFromDate(ccd));
                    }
                    // TODO This could optionally skip re-submission instead?
                }
            }
            
            // Sort the dates and find the earliest:
            List<Date> dateList = new ArrayList<Date>(currentCrawlDates);
            dateList.add(crawlDate);
            Collections.sort(dateList);
            Date firstDate = dateList.get(0);
            solr.setField(SolrFields.CRAWL_DATE,
                    formatter.format(firstDate));
            solr.setField( SolrFields.CRAWL_YEAR, getYearFromDate(firstDate) );
            
            // Use the current value as the waybackDate:
            solr.setField( SolrFields.WAYBACK_DATE, waybackDate );
            
            // Parse ARC name:
            if (!arcname.getRules().isEmpty()) {
                final long nameStart = System.nanoTime();
                arcname.analyse(header, tikainput, solr);
                Instrument.timeRel("WARCPayloadAnalyzers.analyze#total",
                                   "WARCPayloadAnalyzers.analyze#arcname", nameStart);
            }        
            
            // -----------------------------------------------------
            // Apply any annotations:
            // -----------------------------------------------------
            if (ant != null) {
                try {
                    ant.applyAnnotations(saneURI,
                            solr.getSolrDocument());
                } catch (URISyntaxException e) {
                    e.printStackTrace();
                    log.error("Failed to annotate " + saneURI + " : " + e);
                }
            }

            // If this is a revisit record, we should just return an update to the crawl_dates (when using hashUrlId)
            if (WARCConstants.WARCRecordType.revisit.name().equalsIgnoreCase((String) header.getHeaderValue(HEADER_KEY_TYPE))) {
                if (currentCrawlDates.contains(crawlDate)) {
                    return null;
                }
                solr.removeField(SolrFields.CONTENT_LENGTH); // It is 0 and would mess with statistics
                // Copy content_type_served to content_type (no Tika/DROID for revisits)
                solr.addField(SolrFields.SOLR_CONTENT_TYPE, (String) solr.getFieldValue(SolrFields.CONTENT_TYPE_SERVED));
                processContentType(solr, header, content_length, true); // The value set above is used here for content_type_norm
                return solr;
            }

            // -----------------------------------------------------
            // Payload duplication has been checked, ready to parse:
            // -----------------------------------------------------

            final long analyzeStart = System.nanoTime();
            // Mark the start of the payload.
            tikainput.mark( ( int ) content_length );
            
            // Pass on to other extractors as required, resetting the stream before each:
            this.wpa.analyse(archiveName, header, tikainput, solr);
            Instrument.timeRel("WARCIndexer.extract#total", "WARCIndexer.extract#analyzetikainput", analyzeStart);


            // Clear up the caching of the payload:
            hcis.cleanup();

            // Derive normalised/simplified content type:
            processContentType(solr, header, content_length, false);

            // -----------------------------------------------------
            // Payload analysis complete, now performing text analysis:
            // -----------------------------------------------------
            
            this.txa.analyse(solr);

            // Remove the Text Field if required
            if( !isTextIncluded ) {
                solr.removeField( SolrFields.SOLR_EXTRACTED_TEXT );

            } else {
                // Otherwise, decide whether to store or both store and index
                // the text:
                if (storeText == false) {
                    // Copy the text into the indexed (but not stored) field:
                    solr.setField(SolrFields.SOLR_EXTRACTED_TEXT_NOT_STORED,
                            (String) solr.getField(
                                    SolrFields.SOLR_EXTRACTED_TEXT)
                                    .getFirstValue());
                    // Take the text out of the original (stored) field.
                    solr.removeField(SolrFields.SOLR_EXTRACTED_TEXT);
                }
            }
        }
        Instrument.timeRel("WARCIndexerCommand.parseWarcFiles#solrdocCreation",
                     "WARCIndexer.extract#total", start);
        String servedType = "" + solr.getField(SolrFields.CONTENT_TYPE_SERVED);
        Instrument.timeRel("WARCIndexer#content_types",
                     "WARCIndexer#" + (servedType.contains(";") ? servedType.split(";")[0] : servedType),start);
        Instrument.timeRel("WARCIndexer#content_types", start);
        return solr;
    }

    /**
     * Perform URL parsing and manipulation, populating the URL-derived Solr fields
     * (path, URL type, host, SURT host levels, domain and public suffix).
     * 
     * @param solr the record to populate
     * @param fullUrl the target URL to parse
     * @return a correctly escaped java.net.URI for the target URL
     * 
     * @throws URIException
     */
    protected URI parseURL(SolrRecord solr, String fullUrl)
            throws URIException {
        UsableURI url = UsableURIFactory.getInstance(fullUrl);

        solr.setField(SolrFields.SOLR_URL_PATH, url.getPath());

        // Spot 'slash pages':
        if (url.getPath().equals("/") || url.getPath().equals("")
                || url.getPath().matches("/index\\.[a-z]+$")) {
            solr.setField(SolrFields.SOLR_URL_TYPE,
                    SolrFields.SOLR_URL_TYPE_SLASHPAGE);
            // Spot 'robots.txt':
        } else if (url.getPath().equalsIgnoreCase("/robots.txt")) {
            solr.setField(SolrFields.SOLR_URL_TYPE,
                    SolrFields.SOLR_URL_TYPE_ROBOTS_TXT);
        } else {
            solr.setField(SolrFields.SOLR_URL_TYPE,
                    SolrFields.SOLR_URL_TYPE_NORMAL);
        }
        // Record the host (canonicalised, if enabled), the domain
        // and the public suffix:
        String host = url.getHost();
        if (CANONICALISE_HOST)
            host = Normalisation.canonicaliseHost(host);
        solr.setField(SolrFields.SOLR_HOST, host);

        // Add the SURT host
        solr.removeField(SolrFields.SOLR_HOST_SURT);
        ImmutableList<String> levels = LinkExtractor.allLevels(host);
        if (levels != null) {
            for (String level : levels) {
                solr.addField(SolrFields.SOLR_HOST_SURT, SURT.toSURT(level));
            }
        }

        final String domain = LinkExtractor.extractPrivateSuffixFromHost(host);
        solr.setField(SolrFields.DOMAIN, domain);
        solr.setField(SolrFields.PUBLIC_SUFFIX,
                LinkExtractor.extractPublicSuffixFromHost(host));

        // Force correct escaping:
        org.apache.commons.httpclient.URI tempUri =
                new org.apache.commons.httpclient.URI(url.getEscapedURI(), false);

        return URI.create(tempUri.getEscapedURI());

    }

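    /** Formats the (UTC) year of a Date; synchronized because it reuses the shared Calendar instance below. */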
    private synchronized String getYearFromDate(Date date) {
        calendar.setTime(date);
        return Integer.toString(calendar.get(Calendar.YEAR));
    }

    private final Calendar calendar = Calendar
            .getInstance(TimeZone.getTimeZone("UTC"));

    /* ----------------------------------- */

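    /**
     * Validates the HTTP status code and records selected response headers on the
     * SolrRecord: the served content type, Server / X-Powered-By, and the (resolved)
     * redirect Location.
     */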
    private void processHeaders( SolrRecord solr, String statusCode, Header[] httpHeaders , String targetUrl) {
        try {
            // This is a simple test that the status code setting worked:
            int statusCodeInt = Integer.parseInt( statusCode );
            if( statusCodeInt < 0 || statusCodeInt > 1000 )
                throw new Exception( "Status code out of range: " + statusCodeInt );
            // Get the other headers:
            
            for( Header h : httpHeaders ) {
              // Get the type from the server
                if (h.getName().equalsIgnoreCase(HttpHeaders.CONTENT_TYPE)
                        && solr.getField(SolrFields.CONTENT_TYPE_SERVED) == null) {
                    String servedType = h.getValue();
                    if (servedType.length() > 200)
                        servedType = servedType.substring(0, 200);
                    solr.addField(SolrFields.CONTENT_TYPE_SERVED, servedType);
                }
                // Also, grab the X-Powered-By or Server headers if present:
                if (h.getName().equalsIgnoreCase("X-Powered-By"))
                    solr.addField( SolrFields.SERVER, h.getValue() );
                if (h.getName().equalsIgnoreCase(HttpHeaders.SERVER))
                    solr.addField( SolrFields.SERVER, h.getValue() );
                if (h.getName().equalsIgnoreCase(HttpHeaders.LOCATION)) {
                    String location = h.getValue(); // This can be relative and must be resolved to a full URL
                    solr.setField(SolrFields.REDIRECT_TO_NORM, Normalisation.resolveRelative(targetUrl, location));
                }
            }
        } catch( NumberFormatException e ) {
            log.error( "Exception when parsing status code: " + statusCode + ": " + e );
            solr.addParseException("when parsing statusCode", e);
        } catch( Exception e ) {
            log.error( "Exception when parsing headers: " + e );
            solr.addParseException("when parsing headers", e);
        }
    }


    /**
     * Extracts the resource (file) name from the final path segment of a URL,
     * dropping any query string, e.g. "report.pdf" from
     * "http://example.org/docs/report.pdf?download=1".
     * 
     * @param fullUrl the full target URL
     * @return the resource name, or null if the URL contains no "/"
     */
    protected static String parseResourceName(String fullUrl) {
        if( fullUrl.lastIndexOf( "/" ) != -1 ) {
            String path = fullUrl.substring(fullUrl.lastIndexOf("/") + 1);
            if( path.indexOf( "?" ) != -1 ) {
                path = path.substring( 0, path.indexOf( "?" ) );
            }
            if( path.indexOf( "&" ) != -1 ) {
                path = path.substring( 0, path.indexOf( "&" ) );
            }
            return path;
        }
        return null;
    }

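    /**
     * Extracts the lower-cased, alphanumeric-only file extension from a resource name,
     * e.g. "pdf" from "report.pdf"; returns null if there is no extension.
     */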
    protected static String parseExtension(String path) {
        if (path != null && path.indexOf(".") != -1) {
            String ext = path.substring(path.lastIndexOf("."));
            ext = ext.toLowerCase();
            // Avoid odd/malformed extensions:
            // if( ext.contains("%") )
            // ext = ext.substring(0, path.indexOf("%"));
            ext = ext.replaceAll("[^0-9a-z]", "");
            return ext;
        }
        return null;
    }

    /**
     * Timestamp parsing, for the Crawl Date.
     */

    public static SimpleDateFormat formatter = new SimpleDateFormat(
            "yyyy-MM-dd'T'HH:mm:ss'Z'");
    static {
        formatter.setTimeZone( TimeZone.getTimeZone( "GMT" ) );
    }

    /**
     * Returns a Java Date object representing the crawled date.
     * 
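     * Accepts 12-, 14-, 16- or 17-plus-digit Wayback-style timestamps; for example,
     * the 14-digit timestamp 20140101120000 parses to 2014-01-01 12:00:00 GMT.
     * 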
     * @param timestamp a Wayback-style numeric timestamp (digits only)
     * @return the parsed Date, or the current date if the timestamp cannot be parsed
     */
    public static Date getWaybackDate( String timestamp ) {
        Date date = new Date();
        try {
            if( timestamp.length() == 12 ) {
                date = ArchiveUtils.parse12DigitDate( timestamp );
            } else if( timestamp.length() == 14 ) {
                date = ArchiveUtils.parse14DigitDate( timestamp );
            } else if( timestamp.length() == 16 ) {
                date = ArchiveUtils.parse17DigitDate( timestamp + "0" );
            } else if( timestamp.length() >= 17 ) {
                date = ArchiveUtils.parse17DigitDate( timestamp.substring( 0, 17 ) );
            }
        } catch( ParseException p ) {
            p.printStackTrace();
        }
        return date;
    }

    /**
     * Returns a formatted String representing the crawled date.
     * 
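     * For example, the 14-digit Wayback timestamp 20140101120000 is rendered as
     * "2014-01-01T12:00:00Z".
     * 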
     * @param waybackDate a Wayback-style numeric timestamp
     * @return the crawl date as an ISO-8601 (no-millis) string in UTC
     */
    protected static String parseCrawlDate( String waybackDate ) {
        DateTimeFormatter iso_df = ISODateTimeFormat.dateTimeNoMillis()
                .withZone(DateTimeZone.UTC);
        return iso_df.print(new org.joda.time.DateTime(
                getWaybackDate(waybackDate)));
    }

    /**
     * Extracts the four-digit year from a timestamp string, e.g. "2014" from
     * "20140101120000". Returns "unknown" if no usable year is found.
     * 
     * @param timestamp the (Wayback-style) timestamp to extract the year from
     * @return the year as a string, or "unknown"
     */
    public static String extractYear( String timestamp ) {
        // Default to 'unknown':
        String waybackYear = "unknown";
        String waybackDate = timestamp.replaceAll( "[^0-9]", "" );
        if( waybackDate.length() >= 4 )
            waybackYear = waybackDate.substring( 0, 4 );
        // Reject bad values by resetting to 'unknown':
        if( "0000".equals( waybackYear ) )
            waybackYear = "unknown";
        // Return
        return waybackYear;
    }

    /**
     * Derives the normalised/simplified content type fields from the Tika, DROID
     * and served/header content types.
     * 
     * @param solr
     * @param header
     * @param content_length
     * @param revisit whether this is a revisit record (zero-length payloads are then not re-typed as empty)
     */
    private void processContentType(SolrRecord solr,
            ArchiveRecordHeader header, long content_length, boolean revisit) {
        // Get the current content-type:
        String contentType = ( String ) solr.getFieldValue( SolrFields.SOLR_CONTENT_TYPE );

        // Store the raw content type from Tika:
        solr.setField( SolrFields.CONTENT_TYPE_TIKA, contentType );

        // Also get the other content types:
        MediaType mt_tika = MediaType.parse( contentType );
        if( solr.getField( SolrFields.CONTENT_TYPE_DROID ) != null ) {
            MediaType mt_droid = MediaType.parse( ( String ) solr.getField( SolrFields.CONTENT_TYPE_DROID ).getFirstValue() );
            if( mt_tika == null || mt_tika.equals( MediaType.OCTET_STREAM ) ) {
                contentType = mt_droid.toString();
            } else if( mt_droid.getBaseType().equals( mt_tika.getBaseType() ) && mt_droid.getParameters().get( "version" ) != null ) {
                // Union of results:
                mt_tika = new MediaType( mt_tika, mt_droid.getParameters() );
                contentType = mt_tika.toString();
            }
            if( mt_droid.getParameters().get( "version" ) != null ) {
                solr.addField( SolrFields.CONTENT_VERSION, mt_droid.getParameters().get( "version" ) );
            }
        }

        // Fall back on the header-supplied MIME type if the detected type is empty:
        if( contentType != null && contentType.isEmpty() ) {
            if( header.getHeaderFieldKeys().contains( "WARC-Identified-Payload-Type" ) ) {
                contentType = ( ( String ) header.getHeaderFields().get( "WARC-Identified-Payload-Type" ) );
            } else {
                contentType = header.getMimetype();
            }
        }
        // Determine content type:
        if( contentType != null )
            solr.setField( SolrFields.FULL_CONTENT_TYPE, contentType );
        
        // If zero-length, then change to application/x-empty for the 'content_type' field.
        if (content_length == 0 && !revisit)
            contentType = "application/x-empty";

        // Content-Type can still be null
        if( contentType != null ) {
            // Strip parameters out of main type field:
            solr.setField( SolrFields.SOLR_CONTENT_TYPE, contentType.replaceAll( ";.*$", "" ) );

            // Also add a more general, simplified type, as appropriate:
            if( contentType.matches( "^image/.*$" ) ) {
                solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "image" );
                solr.setField(SolrFields.SOLR_TYPE, "Image");
            } else if (contentType.matches("^audio/.*$")
                    || contentType.matches("^application/vnd.rn-realaudio$")) {
                solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "audio" );
                solr.setField(SolrFields.SOLR_TYPE, "Audio");
            } else if (contentType.matches("^video/.*$")
                    || contentType.matches("^application/mp4$")
                    || contentType.matches("^application/vnd.rn-realmedia$")) {
                solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "video" );
                solr.setField(SolrFields.SOLR_TYPE, "Video");
            } else if (contentType.matches("^text/htm.*$")
                    || contentType.matches("^application/xhtml.*$")) {
                solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "html" );
                solr.setField(SolrFields.SOLR_TYPE, "Web Page");
            } else if( contentType.matches( "^application/pdf.*$" ) ) {
                solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "pdf" );
                solr.setField(SolrFields.SOLR_TYPE, "Document");
            } else if( contentType.matches( "^.*word$" ) ) {
                solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "word" );
                solr.setField(SolrFields.SOLR_TYPE, "Document");
            } else if( contentType.matches( "^.*excel$" ) ) {
                solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "excel" );
                solr.setField(SolrFields.SOLR_TYPE, "Data");
            } else if( contentType.matches( "^.*powerpoint$" ) ) {
                solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "powerpoint" );
                solr.setField(SolrFields.SOLR_TYPE, "Presentation");
            } else if( contentType.matches( "^text/plain.*$" ) ) {
                solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "text" );
                solr.setField(SolrFields.SOLR_TYPE, "Document");
            } else {
                solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "other" );
                solr.setField(SolrFields.SOLR_TYPE, "Other");
            }

            // Remove text from JavaScript, CSS, ...
            if( contentType.startsWith( "application/javascript" ) || contentType.startsWith( "text/javascript" ) || contentType.startsWith( "text/css" ) ) {
                solr.removeField( SolrFields.SOLR_EXTRACTED_TEXT );
            }
        }
    }

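    /** Returns false if the URL matches any of the configured url_exclude regular expressions (each is matched anywhere in the URL). */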
    private boolean checkUrl( String url ) {
        for( String exclude : url_excludes ) {
            if (!"".equalsIgnoreCase(exclude)
                    && url.matches(".*" + exclude + ".*")) {
                return false;
            }
        }
        return true;
    }

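    /** Returns true if the URL starts with one of the configured protocol_include prefixes; an empty include matches everything. */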
    private boolean checkProtocol( String url ) {
        for( String include : protocol_includes ) {
            if ("".equalsIgnoreCase(include) || url.startsWith(include)) {
                return true;
            }
        }
        return false;
    }

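    /** Returns true if the HTTP status code starts with one of the configured response_include prefixes (e.g. "2" matches all 2xx codes); an empty include matches everything. */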
    private boolean checkResponseCode( String statusCode ) {
        if( statusCode == null )
            return false;
        // Check for match:
        for( String include : response_includes ) {
            if ("".equalsIgnoreCase(include) || statusCode.startsWith(include)) {
                return true;
            }
        }
        // Exclude
        return false;
    }

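    /** Returns true if the (W)ARC record type is listed in the configured record_type_include values. */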
    private boolean checkRecordType( String type ) {
        if (record_type_includes.contains(type)) {
                return true;
        }
        log.debug("Skipping record of type " + type);
        return false;
    }

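    /** Applies the optional Wayback-style static exclusion filter; returns true if the URL may be indexed (or if no filter is configured). */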
    private boolean checkExclusionFilter( String uri ) {
        // Default to no exclusions:
        if( smef == null )
            return true;
        // Otherwise:
        ExclusionFilter ef = smef.get();
        CaptureSearchResult r = new CaptureSearchResult();
        // r.setOriginalUrl(uri);
        r.setUrlKey( uri );
        try {
            if( ef.filterObject( r ) == ExclusionFilter.FILTER_INCLUDE ) {
                return true;
            }
        } catch( Exception e ) {
            log.error( "Exclusion filtering failed with exception: " + e );
            e.printStackTrace();
        }
        log.debug( "EXCLUDING this URL due to filter: " + uri );
        // Exclude:
        return false;
    }

}



