org.apache.pdfbox.examples.lucene.IndexPDFFiles Maven / Gradle / Ivy
Show all versions of pdfbox-examples Show documentation
package org.apache.pdfbox.examples.lucene;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Date;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
* Index all PDF files under a directory.
*
* This is a command-line application demonstrating simple Lucene indexing. Run it with no command-line arguments for
* usage information.
*
* It is based on a demo provided by the Lucene project.
*/
public final class IndexPDFFiles
{
    /** File name suffix (including the dot) that marks a file as a PDF; matched ignoring case. */
    private static final String PDF_EXTENSION = ".pdf";

    /** Command-line synopsis printed when the arguments are missing or malformed. */
    private static final String USAGE = "java org.apache.pdfbox.examples.lucene.IndexPDFFiles"
            + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
            + "This indexes all PDF documents in DOCS_PATH, creating a Lucene index "
            + "in INDEX_PATH that can be searched with SearchFiles";

    private IndexPDFFiles()
    {
        // utility class, not meant to be instantiated
    }

    /**
     * Index all PDF files under a directory.
     *
     * Recognized options are <code>-index INDEX_PATH</code> (defaults to "index"),
     * <code>-docs DOCS_PATH</code> (required) and <code>-update</code> (append to / update an existing index
     * instead of creating a new one). Unknown arguments are ignored. The JVM exits with status 1 if the
     * arguments are unusable or the document directory cannot be read.
     *
     * @param args command line arguments
     */
    public static void main(String[] args)
    {
        String indexPath = "index";
        String docsPath = null;
        boolean create = true;
        for (int i = 0; i < args.length; i++)
        {
            String arg = args[i];
            if ("-index".equals(arg) || "-docs".equals(arg))
            {
                // both options need a value; without this check a trailing option
                // would fail with an ArrayIndexOutOfBoundsException
                if (i + 1 >= args.length)
                {
                    System.err.println("Missing value for option " + arg);
                    System.err.println("Usage: " + USAGE);
                    System.exit(1);
                    return;
                }
                String value = args[++i];
                if ("-index".equals(arg))
                {
                    indexPath = value;
                }
                else
                {
                    docsPath = value;
                }
            }
            else if ("-update".equals(arg))
            {
                create = false;
            }
        }
        if (docsPath == null)
        {
            System.err.println("Usage: " + USAGE);
            System.exit(1);
            return;
        }
        final File docDir = new File(docsPath);
        if (!docDir.exists() || !docDir.canRead())
        {
            System.out.println("Document directory '" + docDir.getAbsolutePath()
                    + "' does not exist or is not readable, please check the path");
            System.exit(1);
            return;
        }
        long start = System.currentTimeMillis();
        try
        {
            System.out.println("Indexing to directory '" + indexPath + "'...");
            Directory dir = FSDirectory.open(new File(indexPath));
            try
            {
                Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
                IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer);
                if (create)
                {
                    // Create a new index in the directory, removing any
                    // previously indexed documents:
                    iwc.setOpenMode(OpenMode.CREATE);
                }
                else
                {
                    // Add new documents to an existing index:
                    iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
                }
                // Optional: for better indexing performance, if you
                // are indexing many documents, increase the RAM
                // buffer. But if you do this, increase the max heap
                // size to the JVM (eg add -Xmx512m or -Xmx1g):
                //
                // iwc.setRAMBufferSizeMB(256.0);
                IndexWriter writer = new IndexWriter(dir, iwc);
                try
                {
                    indexDocs(writer, docDir);
                    // NOTE: if you want to maximize search performance,
                    // you can optionally call forceMerge here. This can be
                    // a terribly costly operation, so generally it's only
                    // worth it when your index is relatively static (ie
                    // you're done adding documents to it):
                    //
                    // writer.forceMerge(1);
                }
                finally
                {
                    // close the writer even if indexing failed part-way
                    writer.close();
                }
            }
            finally
            {
                dir.close();
            }
            long end = System.currentTimeMillis();
            System.out.println((end - start) + " total milliseconds");
        }
        catch (IOException e)
        {
            System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
        }
    }

    /**
     * Indexes the given file using the given writer, or if a directory is given, recurses over files and directories
     * found under the given directory.
     *
     * Only files whose name ends with ".pdf" (ignoring case) are indexed, one Lucene document per file; all
     * other files are reported as skipped. Files that cannot be read or opened are silently ignored.
     *
     * @param writer Writer to the index where the given file/dir info will be stored
     * @param file The file to index, or the directory to recurse into to find files to index
     * @throws IOException If there is a low-level I/O error
     */
    static void indexDocs(IndexWriter writer, File file) throws IOException
    {
        // do not try to index files that cannot be read
        if (!file.canRead())
        {
            return;
        }
        if (file.isDirectory())
        {
            String[] files = file.list();
            // list() returns null if an IO error occurs
            if (files != null)
            {
                for (String fileName : files)
                {
                    indexDocs(writer, new File(file, fileName));
                }
            }
            return;
        }
        if (!canOpen(file))
        {
            return;
        }
        if (!isPdf(file))
        {
            System.out.println("Skipping " + file);
            return;
        }
        System.out.println("Indexing PDF document: " + file);
        Document doc = LucenePDFDocument.getDocument(file);
        if (writer.getConfig().getOpenMode() == OpenMode.CREATE)
        {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        }
        else
        {
            // Existing index (an old copy of this document may have been indexed) so
            // we use updateDocument instead to replace the old one matching the same
            // uid, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("uid", LucenePDFDocument.createUID(file)), doc);
        }
    }

    /**
     * Checks whether the file can actually be opened for reading.
     *
     * At least on windows, some temporary files raise a FileNotFoundException with an "access denied" message
     * even though {@link File#canRead()} returned true, so the only reliable check is to try to open the file.
     * The probe stream is closed immediately; it is not used to read the document.
     *
     * @param file the file to probe
     * @return true if the file could be opened, false if opening it raised a FileNotFoundException
     * @throws IOException if closing the probe stream fails
     */
    private static boolean canOpen(File file) throws IOException
    {
        FileInputStream probe;
        try
        {
            probe = new FileInputStream(file);
        }
        catch (FileNotFoundException fnfe)
        {
            return false;
        }
        probe.close();
        return true;
    }

    /**
     * Tells whether the name of the given file ends with ".pdf", ignoring case.
     *
     * @param file the file to check
     * @return true if the file name has a PDF extension
     */
    private static boolean isPdf(File file)
    {
        String name = file.getName();
        // regionMatches returns false for a negative offset, i.e. for names shorter than the extension,
        // and its case-insensitive comparison does not depend on the default locale
        return name.regionMatches(true, name.length() - PDF_EXTENSION.length(),
                PDF_EXTENSION, 0, PDF_EXTENSION.length());
    }
}