org.apache.pdfbox.examples.lucene.LucenePDFDocument Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of pdfbox-examples Show documentation
The Apache PDFBox library is an open source Java tool for working with PDF documents. This artefact contains examples on how the library can be used.
There is a newer version: 3.0.3
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.examples.lucene;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.Calendar;
import java.util.Date;

import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.text.PDFTextStripper;

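/**
 * This class is used to create a document for the lucene search engine. This should easily plug into the IndexPDFFiles
 * that comes with the lucene project. This class will populate the following fields.
 * <table>
 * <caption>Fields populated in the Lucene document</caption>
 * <tr>
 * <th>Lucene Field Name</th>
 * <th>Description</th>
 * </tr>
 * <tr>
 * <td>path</td>
 * <td>File system path if loaded from a file</td>
 * </tr>
 * <tr>
 * <td>url</td>
 * <td>URL to PDF document</td>
 * </tr>
 * <tr>
 * <td>contents</td>
 * <td>Entire contents of PDF document, indexed but not stored</td>
 * </tr>
 * <tr>
 * <td>summary</td>
 * <td>First 500 characters of content</td>
 * </tr>
 * <tr>
 * <td>modified</td>
 * <td>The modified date/time according to the url or path</td>
 * </tr>
 * <tr>
 * <td>uid</td>
 * <td>A unique identifier for the Lucene document.</td>
 * </tr>
 * <tr>
 * <td>Author</td>
 * <td>From PDF meta-data if available</td>
 * </tr>
 * <tr>
 * <td>CreationDate</td>
 * <td>From PDF meta-data if available</td>
 * </tr>
 * <tr>
 * <td>Creator</td>
 * <td>From PDF meta-data if available</td>
 * </tr>
 * <tr>
 * <td>Keywords</td>
 * <td>From PDF meta-data if available</td>
 * </tr>
 * <tr>
 * <td>ModificationDate</td>
 * <td>From PDF meta-data if available</td>
 * </tr>
 * <tr>
 * <td>Producer</td>
 * <td>From PDF meta-data if available</td>
 * </tr>
 * <tr>
 * <td>Subject</td>
 * <td>From PDF meta-data if available</td>
 * </tr>
 * <tr>
 * <td>Title</td>
 * <td>From PDF meta-data if available</td>
 * </tr>
 * <tr>
 * <td>Trapped</td>
 * <td>From PDF meta-data if available</td>
 * </tr>
 * </table>
 * 
 * @author Ben Litchfield
 * 
 */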
public class LucenePDFDocument
{
    /**
     * Platform name-separator character. It is replaced by '/' when building the "url" field for a file and by
     * '\u0000' when building UIDs.
     */
    private static final char FILE_SEPARATOR = File.separatorChar;

    /**
     * Resolution used for every date/time value written to the index. SECOND is used by default because of the
     * caveat of increased search times when a finer resolution is used.
     */
    private static final DateTools.Resolution DATE_TIME_RES = DateTools.Resolution.SECOND;

    /** Text stripper used for extraction; created lazily by {@link #addContent} when none has been set. */
    private PDFTextStripper stripper = null;

    /** Field type for values that are stored with the document but not indexed (flagged as tokenized). */
    public static final FieldType TYPE_STORED_NOT_INDEXED = new FieldType();

    static
    {
        TYPE_STORED_NOT_INDEXED.setIndexed(false);
        TYPE_STORED_NOT_INDEXED.setStored(true);
        TYPE_STORED_NOT_INDEXED.setTokenized(true);
        // freeze so that the shared public constant can no longer be modified
        TYPE_STORED_NOT_INDEXED.freeze();
    }

    /**
     * Constructor.
     */
    public LucenePDFDocument()
    {
    }

    /**
     * Set the text stripper that will be used during extraction.
     * 
     * @param aStripper The new pdf text stripper.
     */
    public void setTextStripper(PDFTextStripper aStripper)
    {
        stripper = aStripper;
    }

    /**
     * Format a timestamp with the resolution used throughout this class.
     * 
     * @param time The time in milliseconds.
     * @return The time formatted by {@link DateTools} at {@link #DATE_TIME_RES} resolution.
     */
    private static String timeToString(long time)
    {
        return DateTools.timeToString(time, DATE_TIME_RES);
    }

    /**
     * Add a stored {@link StringField} (the value is indexed as a single token). Does nothing if the value is null.
     * 
     * @param document The document to add the field to.
     * @param name The field name.
     * @param value The field value, may be null.
     */
    private void addKeywordField(Document document, String name, String value)
    {
        if (value != null)
        {
            document.add(new StringField(name, value, Field.Store.YES));
        }
    }

    /**
     * Add a Reader-valued {@link TextField}, so the content gets tokenized and indexed. Does nothing if the
     * reader is null.
     * 
     * @param document The document to add the field to.
     * @param name The field name.
     * @param value The reader supplying the field content, may be null.
     */
    private void addTextField(Document document, String name, Reader value)
    {
        if (value != null)
        {
            document.add(new TextField(name, value));
        }
    }

    /**
     * Add a stored {@link TextField}. Does nothing if the value is null.
     * 
     * @param document The document to add the field to.
     * @param name The field name.
     * @param value The field value, may be null.
     */
    private void addTextField(Document document, String name, String value)
    {
        if (value != null)
        {
            document.add(new TextField(name, value, Field.Store.YES));
        }
    }

    /**
     * Add a date as a stored {@link TextField}, formatted at {@link #DATE_TIME_RES} resolution. Does nothing if
     * the date is null.
     * 
     * @param document The document to add the field to.
     * @param name The field name.
     * @param value The date, may be null.
     */
    private void addTextField(Document document, String name, Date value)
    {
        if (value != null)
        {
            addTextField(document, name, DateTools.dateToString(value, DATE_TIME_RES));
        }
    }

    /**
     * Add a calendar value as a stored {@link TextField}, formatted like a {@link Date}. Does nothing if the
     * calendar is null.
     * 
     * @param document The document to add the field to.
     * @param name The field name.
     * @param value The calendar, may be null.
     */
    private void addTextField(Document document, String name, Calendar value)
    {
        if (value != null)
        {
            addTextField(document, name, value.getTime());
        }
    }

    /**
     * Add a field of type {@link #TYPE_STORED_NOT_INDEXED}: the value is stored with the document but is not
     * searchable. Does nothing if the value is null.
     * 
     * @param document The document to add the field to.
     * @param name The field name.
     * @param value The field value, may be null.
     */
    private static void addUnindexedField(Document document, String name, String value)
    {
        if (value != null)
        {
            document.add(new Field(name, value, TYPE_STORED_NOT_INDEXED));
        }
    }

    /**
     * Add a field that is indexed as a single token and not stored. Does nothing if the value is null.
     * 
     * @param document The document to add the field to.
     * @param name The field name.
     * @param value The field value, may be null.
     */
    private void addUnstoredKeywordField(Document document, String name, String value)
    {
        if (value != null)
        {
            // A StringField is used on purpose: the value (the uid) must be indexed verbatim. A TextField type
            // would run it through the analyzer and tokenize it, so it could no longer be matched as one term.
            document.add(new StringField(name, value, Field.Store.NO));
        }
    }

    /**
     * Convert the PDF stream to a lucene document. Only the content, the meta-data and the summary fields are
     * populated; the stream is not closed by this method.
     * 
     * @param is The input stream.
     * @return The input stream converted to a lucene document.
     * @throws IOException If there is an error converting the PDF.
     */
    public Document convertDocument(InputStream is) throws IOException
    {
        Document document = new Document();
        // no location is known for a bare stream, so an empty one is used in error messages
        addContent(document, is, "");
        return document;
    }

    /**
     * This will take a reference to a PDF document and create a lucene document.
     * 
     * @param file A reference to a PDF document.
     * @return The converted lucene document.
     * 
     * @throws IOException If there is an exception while converting the document.
     */
    public Document convertDocument(File file) throws IOException
    {
        Document document = new Document();
        String path = file.getPath();

        // Add the path as fields named "path" and "url". Use UnIndexed fields, so
        // that they are just stored with the document, but are not searchable.
        addUnindexedField(document, "path", path);
        addUnindexedField(document, "url", path.replace(FILE_SEPARATOR, '/'));

        // Add the last modified date of the file a field named "modified". Use a
        // Keyword field, so that it's searchable, but so that no attempt is made
        // to tokenize the field into words.
        addKeywordField(document, "modified", timeToString(file.lastModified()));

        // Add the uid as a field, so that index can be incrementally maintained.
        // This field is not stored with document, it is indexed, but it is not
        // tokenized prior to indexing.
        addUnstoredKeywordField(document, "uid", createUID(file));

        FileInputStream input = null;
        try
        {
            input = new FileInputStream(file);
            addContent(document, input, path);
        }
        finally
        {
            if (input != null)
            {
                input.close();
            }
        }

        return document;
    }

    /**
     * Convert the document from a PDF to a lucene document.
     * 
     * @param url A url to a PDF document.
     * @return The PDF converted to a lucene document.
     * @throws IOException If there is an error while converting the document.
     */
    public Document convertDocument(URL url) throws IOException
    {
        Document document = new Document();
        URLConnection connection = url.openConnection();
        connection.connect();

        String location = url.toExternalForm();
        // read once so that the "modified" field and the uid are built from the same value
        long lastModified = connection.getLastModified();

        // Add the url as a field named "url". Use an UnIndexed field, so
        // that the url is just stored with the document, but is not searchable.
        addUnindexedField(document, "url", location);

        // Add the last modified date of the file a field named "modified". Use a
        // Keyword field, so that it's searchable, but so that no attempt is made
        // to tokenize the field into words.
        addKeywordField(document, "modified", timeToString(lastModified));

        // Add the uid as a field, so that index can be incrementally maintained.
        // This field is not stored with document, it is indexed, but it is not
        // tokenized prior to indexing.
        addUnstoredKeywordField(document, "uid", createUID(url, lastModified));

        InputStream input = null;
        try
        {
            input = connection.getInputStream();
            addContent(document, input, location);
        }
        finally
        {
            if (input != null)
            {
                input.close();
            }
        }

        return document;
    }

    /**
     * This will get a lucene document from a PDF stream, using a new converter with a default text stripper.
     * 
     * @param is The stream to read the PDF from.
     * 
     * @return The lucene document.
     * 
     * @throws IOException If there is an error parsing or indexing the document.
     */
    public static Document getDocument(InputStream is) throws IOException
    {
        return new LucenePDFDocument().convertDocument(is);
    }

    /**
     * This will get a lucene document from a PDF file, using a new converter with a default text stripper.
     * 
     * @param file The file to get the document for.
     * 
     * @return The lucene document.
     * 
     * @throws IOException If there is an error parsing or indexing the document.
     */
    public static Document getDocument(File file) throws IOException
    {
        return new LucenePDFDocument().convertDocument(file);
    }

    /**
     * This will get a lucene document from a PDF located by a URL, using a new converter with a default text
     * stripper.
     * 
     * @param url The url to get the document for.
     * 
     * @return The lucene document.
     * 
     * @throws IOException If there is an error parsing or indexing the document.
     */
    public static Document getDocument(URL url) throws IOException
    {
        return new LucenePDFDocument().convertDocument(url);
    }

    /**
     * This will add the contents to the lucene document: the extracted text ("contents"), the PDF meta-data
     * fields and a "summary" holding at most the first 500 characters of the text.
     * 
     * @param document The document to add the contents to.
     * @param is The stream to get the contents from.
     * @param documentLocation The location of the document, used just for debug messages.
     * 
     * @throws IOException If there is an error parsing the document, or if it cannot be opened with an empty
     * password.
     */
    private void addContent(Document document, InputStream is, String documentLocation) throws IOException
    {
        PDDocument pdfDocument = null;
        try
        {
            // try to open the document with an empty password
            pdfDocument = PDDocument.load(is, "");

            // create a writer where to append the text content.
            StringWriter writer = new StringWriter();
            if (stripper == null)
            {
                // no stripper was supplied; create one and keep it for later conversions
                stripper = new PDFTextStripper();
            }
            stripper.writeText(pdfDocument, writer);

            String contents = writer.toString();

            // Add the tag-stripped contents as a Reader-valued Text field so it will
            // get tokenized and indexed.
            addTextField(document, "contents", new StringReader(contents));

            addDocumentInformation(document, pdfDocument.getDocumentInformation());

            // Add the summary as an UnIndexed field, so that it is stored and returned
            // with hit documents for display.
            int summarySize = Math.min(contents.length(), 500);
            addUnindexedField(document, "summary", contents.substring(0, summarySize));
        }
        catch (InvalidPasswordException e)
        {
            // they didn't supply a password and the default of "" was wrong.
            throw new IOException("Error: The document(" + documentLocation + ") is encrypted and will not be indexed.", e);
        }
        finally
        {
            if (pdfDocument != null)
            {
                pdfDocument.close();
            }
        }
    }

    /**
     * Copy the PDF document information entries into stored text fields; entries that are null are skipped.
     * 
     * @param document The document to add the fields to.
     * @param info The PDF document information, may be null in which case nothing is added.
     */
    private void addDocumentInformation(Document document, PDDocumentInformation info)
    {
        if (info == null)
        {
            return;
        }
        addTextField(document, "Author", info.getAuthor());
        addTextField(document, "CreationDate", info.getCreationDate());
        addTextField(document, "Creator", info.getCreator());
        addTextField(document, "Keywords", info.getKeywords());
        addTextField(document, "ModificationDate", info.getModificationDate());
        addTextField(document, "Producer", info.getProducer());
        addTextField(document, "Subject", info.getSubject());
        addTextField(document, "Title", info.getTitle());
        addTextField(document, "Trapped", info.getTrapped());
    }

    /**
     * Create an UID for the given url using the given time. The UID is the external form of the url with every
     * occurrence of the platform file separator replaced by '\u0000', followed by '\u0000' and the formatted
     * time. Because the platform file separator is used, the result for the same url can differ between
     * platforms.
     * 
     * @param url the url we have to create an UID for
     * @param time the time, in milliseconds, to use in the UID
     * 
     * @return the created UID
     */
    public static String createUID(URL url, long time)
    {
        return url.toExternalForm().replace(FILE_SEPARATOR, '\u0000') + "\u0000" + timeToString(time);
    }

    /**
     * Create an UID for the given file. The UID is the path of the file with every file separator replaced by
     * '\u0000', followed by '\u0000' and the formatted last modified time of the file.
     * 
     * @param file the file we have to create an UID for
     * 
     * @return the created UID
     */
    public static String createUID(File file)
    {
        return file.getPath().replace(FILE_SEPARATOR, '\u0000') + "\u0000" + timeToString(file.lastModified());
    }
}
Lucene Field Name	Description
path	File system path if loaded from a file
url	URL to PDF document
contents	Entire contents of PDF document, indexed but not stored
summary	First 500 characters of content
modified	The modified date/time according to the url or path
uid	A unique identifier for the Lucene document.
CreationDate	From PDF meta-data if available
Creator	From PDF meta-data if available
Keywords	From PDF meta-data if available
ModificationDate	From PDF meta-data if available
Producer	From PDF meta-data if available
Subject	From PDF meta-data if available
Trapped	From PDF meta-data if available