gov.sandia.cognition.text.document.extractor.TextDocumentExtractor Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cognitive-foundry Show documentation
A single jar with all the Cognitive Foundry components.
There is a newer version: 4.0.1
/*
 * File:                TextDocumentExtractor.java
 * Authors:             Justin Basilico
 * Company:             Sandia National Laboratories
 * Project:             Cognitive Foundry
 * 
 * Copyright February 23, 2009, Sandia Corporation.
 * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive 
 * license for use of this work by or on behalf of the U.S. Government. Export 
 * of this program may require a license from the United States Government. 
 * See CopyrightHistory.txt for complete details.
 * 
 */

package gov.sandia.cognition.text.document.extractor;

import gov.sandia.cognition.io.FileUtil;
import gov.sandia.cognition.text.document.DefaultDocument;
import gov.sandia.cognition.text.document.Document;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;

/**
 * Extracts text from plain text documents.
 * 
 * @author  Justin Basilico
 * @since   3.0
 */
public class TextDocumentExtractor
    extends AbstractSingleDocumentExtractor
{
    /** The content type is {@value}. */
    public static final String CONTENT_TYPE = "text/plain";

    /** The default set of (lower-case) file extensions for text files. */
    public static final List<String> DEFAULT_TEXT_FILE_EXTENSIONS =
        Collections.unmodifiableList(Arrays.asList("txt", "text"));

    /** Prefix of the content-type parameter that names the character set. */
    private static final String CHARSET_PARAMETER_PREFIX = "charset=";

    /**
     * Creates a new {@code TextDocumentExtractor}.
     */
    public TextDocumentExtractor()
    {
        super();
    }

    /**
     * Determines whether the given URI looks like a plain text file, based
     * only on the extension of its path. The comparison against
     * {@link #DEFAULT_TEXT_FILE_EXTENSIONS} is case-insensitive.
     *
     * @param   uri
     *      The URI to check.
     * @return
     *      True if the extension of the URI's path is one of the known text
     *      file extensions; false if it is not, if the URI has no path (for
     *      example an opaque URI), or if the path has no extension.
     * @throws  IOException
     *      Declared for compatibility with callers; this implementation
     *      performs no I/O itself.
     */
    public boolean canExtract(
        final URI uri)
        throws IOException
    {
        // Opaque URIs (for example "mailto:...") have no path component.
        final String fileName = uri.getPath();
        if (fileName == null)
        {
            return false;
        }

        final String fileExtension = FileUtil.getExtension(fileName);
        if (fileExtension == null)
        {
            return false;
        }

        // Lower-case independently of the default locale so that the match
        // does not change with the platform's language settings.
        return DEFAULT_TEXT_FILE_EXTENSIONS.contains(
            fileExtension.toLowerCase(Locale.ROOT));
    }

    /**
     * Determines whether the content reported by the given connection is
     * plain text. Only the media type is compared, case-insensitively, so
     * a value such as {@code "text/plain; charset=UTF-8"} is accepted.
     *
     * @param   connection
     *      The connection whose content type is checked.
     * @return
     *      True if the media type of the connection is {@link #CONTENT_TYPE};
     *      false otherwise, including when no content type is reported.
     * @throws  IOException
     *      Declared for compatibility with callers.
     */
    public boolean canExtract(
        final URLConnection connection)
        throws IOException
    {
        return CONTENT_TYPE.equals(
            getMediaType(connection.getContentType()));
    }

    /**
     * Reads the whole content of the connection as text and wraps it in a
     * document. The text is read line by line and every line is terminated
     * with a single {@code '\n'} in the resulting body. Bytes are decoded
     * with the character set declared in the connection's content type, or
     * with the platform default when none (or an unusable one) is declared.
     *
     * @param   connection
     *      The connection to read the document from.
     * @return
     *      A new document whose meta-data is read from the connection and
     *      whose body is the text that was read.
     * @throws  IOException
     *      If the content of the connection cannot be read.
     */
    public Document extractDocument(
        final URLConnection connection)
        throws IOException
    {
        final Charset charset = getCharset(connection.getContentType());

        // We are going to read the body from the connection.
        final StringBuilder body = new StringBuilder();
        final BufferedReader reader = new BufferedReader(
            new InputStreamReader(connection.getInputStream(), charset));
        try
        {
            String line = null;
            while ((line = reader.readLine()) != null)
            {
                body.append(line);
                body.append("\n");
            }
        }
        finally
        {
            reader.close();
        }

        // Create the resulting document.
        final DefaultDocument result = new DefaultDocument();

        // Set the meta-data based on the connection.
        result.readMetaData(connection);

        // Set the body that we read in.
        result.setBody(body.toString());

        return result;
    }

    /**
     * Extracts the lower-cased "type/subtype" part of a content type value,
     * dropping any parameters that follow the first semicolon.
     *
     * @param   contentType
     *      The content type value. May be null.
     * @return
     *      The media type without parameters, or null if the input is null.
     */
    private static String getMediaType(
        final String contentType)
    {
        if (contentType == null)
        {
            return null;
        }

        final int parametersStart = contentType.indexOf(';');
        final String mediaType = parametersStart < 0
            ? contentType : contentType.substring(0, parametersStart);
        return mediaType.trim().toLowerCase(Locale.ROOT);
    }

    /**
     * Finds the character set named by the {@code charset} parameter of a
     * content type value.
     *
     * @param   contentType
     *      The content type value. May be null.
     * @return
     *      The declared character set, or the platform default when the
     *      parameter is missing, malformed, or names an unsupported set.
     */
    private static Charset getCharset(
        final String contentType)
    {
        if (contentType != null)
        {
            // Element 0 is the media type itself; parameters follow it.
            final String[] parts = contentType.split(";");
            for (int i = 1; i < parts.length; i++)
            {
                final String parameter = parts[i].trim();
                final int prefixLength = CHARSET_PARAMETER_PREFIX.length();
                if (!parameter.regionMatches(
                    true, 0, CHARSET_PARAMETER_PREFIX, 0, prefixLength))
                {
                    continue;
                }

                String name = parameter.substring(prefixLength).trim();
                if (name.length() >= 2
                    && name.startsWith("\"") && name.endsWith("\""))
                {
                    // The value may be given as a quoted string.
                    name = name.substring(1, name.length() - 1);
                }

                try
                {
                    return Charset.forName(name);
                }
                catch (IllegalArgumentException ignored)
                {
                    // Malformed or unsupported name: use the default below
                    // rather than failing the whole extraction.
                    break;
                }
            }
        }

        return Charset.defaultCharset();
    }
}