gov.sandia.cognition.text.document.extractor.TextDocumentExtractor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of cognitive-foundry Show documentation
Show all versions of cognitive-foundry Show documentation
A single jar with all the Cognitive Foundry components.
/*
* File: TextDocumentExtractor.java
* Authors: Justin Basilico
* Company: Sandia National Laboratories
* Project: Cognitive Foundry
*
* Copyright February 23, 2009, Sandia Corporation.
* Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
* license for use of this work by or on behalf of the U.S. Government. Export
* of this program may require a license from the United States Government.
* See CopyrightHistory.txt for complete details.
*
*/
package gov.sandia.cognition.text.document.extractor;
import gov.sandia.cognition.io.FileUtil;
import gov.sandia.cognition.text.document.DefaultDocument;
import gov.sandia.cognition.text.document.Document;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URLConnection;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
* Extracts text from plain text documents.
*
* @author Justin Basilico
* @since 3.0
*/
public class TextDocumentExtractor
extends AbstractSingleDocumentExtractor
{
/** The content type is {@value}. */
public static final String CONTENT_TYPE = "text/plain";
/** The default set of file extensions for text files. */
public static final List DEFAULT_TEXT_FILE_EXTENSIONS =
Collections.unmodifiableList(Arrays.asList(new String[] { "txt", "text" }));
/**
* Creates a new {@code TextDocumentExtractor}.
*/
public TextDocumentExtractor()
{
super();
}
public boolean canExtract(
final URI uri)
throws IOException
{
// Get the extension of the file.
final String fileName = uri.getPath();
final String fileExtension = FileUtil.getExtension(fileName);
// Match the extension to our list of known extensions.
if (fileExtension != null)
{
final String fileExtensionLowerCase = fileExtension.toLowerCase();
return DEFAULT_TEXT_FILE_EXTENSIONS.contains(fileExtensionLowerCase);
}
return false;
}
public boolean canExtract(
final URLConnection connection)
throws IOException
{
// Make sure the content types match.
final String contentType = connection.getContentType();
return CONTENT_TYPE.equals(contentType);
}
public Document extractDocument(
final URLConnection connection)
throws IOException
{
// We are going to read the body from the connection.
final StringBuffer body = new StringBuffer();
final BufferedReader reader = new BufferedReader(
new InputStreamReader(connection.getInputStream()));
try
{
String s = null;
while ((s = reader.readLine()) != null)
{
body.append(s);
body.append("\n");
}
}
finally
{
reader.close();
}
// Create the resulting document.
final DefaultDocument result = new DefaultDocument();
// Set the meta-data based on the connection.
result.readMetaData(connection);
// Set the body that we read in.
result.setBody(body.toString());
return result;
}
}