org.archive.modules.extractor.ExtractorDOC

This project contains some of the configurable modules used within the Heritrix application to crawl the web. The modules in this project can be used in applications other than Heritrix, however.

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.modules.extractor;

import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.archive.io.ReplayInputStream;
import org.archive.io.SeekReader;
import org.archive.io.SeekReaderCharSequence;
import org.archive.modules.CrawlURI;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.ms.Doc;

/**
 *  This class allows the caller to extract href-style links from
 *  Word 97-format (.doc) documents.
 *
 * @author Parker Thompson
 *
 */
public class ExtractorDOC extends ContentExtractor {

    @SuppressWarnings("unused")
    private static final long serialVersionUID = 3L;
    
    // Matches Word HYPERLINK field codes and captures the quoted link target.
    private static Pattern PATTERN = Pattern.compile("HYPERLINK.*?\"(.*?)\"");

    private static Logger logger =
        Logger.getLogger("org.archive.crawler.extractor.ExtractorDOC");

    public ExtractorDOC() {
    }

    
    @Override
    protected boolean shouldExtract(CrawlURI uri) {
        String mimeType = uri.getContentType();
        if (mimeType == null) {
            return false;
        }
        return mimeType.toLowerCase().startsWith("application/msword");
    }
    
    
    /**
     *  Processes a Word document and extracts any hyperlinks from it.
     *  This only extracts href-style (HYPERLINK field) links, and does not
     *  examine the document text for bare URIs.
     *
     * @param curi CrawlURI to process.
     * @return true if the document was processed, false if it could not be read.
     */
    @Override
    protected boolean innerExtract(CrawlURI curi) {
        int links = 0;
        ReplayInputStream documentStream = null;
        SeekReader docReader = null;

        // Get the doc as a repositionable reader
        try {
            documentStream = curi.getRecorder().getContentReplayInputStream();
            if (documentStream == null) {
                // TODO: note problem
                return false;
            }
            docReader = Doc.getText(documentStream);
        } catch (Exception e) {
            curi.getNonFatalFailures().add(e);
            IOUtils.closeQuietly(documentStream);
            return false;
        }

        CharSequence cs = new SeekReaderCharSequence(docReader, 0);
        Matcher m = PATTERN.matcher(cs);
        while (m.find()) {
            links++;
            addLink(curi, m.group(1));
        }
        documentStream.destroy();
        logger.fine(curi + " has " + links + " links.");
        return true;
    }
    
    
    private void addLink(CrawlURI curi, String hyperlink) {
        try {
            // Resolve the extracted target against the document's own URI,
            // then record it as a navigational outlink.
            UURI dest = UURIFactory.getInstance(curi.getUURI(), hyperlink);
            LinkContext lc = LinkContext.NAVLINK_MISC;
            addOutlink(curi, dest, lc, Hop.NAVLINK);
        } catch (URIException e1) {
            logUriError(e1, curi.getUURI(), hyperlink);
        }
        numberOfLinksExtracted.incrementAndGet();
    }
}
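
As a rough, self-contained illustration of what the PATTERN regular expression above matches, the sketch below runs the same expression over a plain Java string that imitates Word HYPERLINK field codes. The class name HyperlinkPatternDemo and the sample text are invented for this example; in ExtractorDOC the character sequence actually comes from Doc.getText() over the recorded document stream.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class HyperlinkPatternDemo {

    // Same expression as ExtractorDOC.PATTERN: after the HYPERLINK keyword,
    // lazily skip to the first quoted argument and capture its contents.
    private static final Pattern PATTERN =
            Pattern.compile("HYPERLINK.*?\"(.*?)\"");

    public static void main(String[] args) {
        // Hypothetical text, roughly as it might look once a .doc file's
        // field codes have been flattened into characters.
        String text = "Intro HYPERLINK \"http://example.com/page\" body text "
                + "HYPERLINK \\l \"section2\" trailing text";

        Matcher m = PATTERN.matcher(text);
        while (m.find()) {
            // group(1) is the quoted target: an absolute URL for external
            // links, or a bookmark name for internal (\l) links.
            System.out.println("extracted: " + m.group(1));
        }
    }
}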



