package eu.unicore.uas.metadata;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.io.FileUtils;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.tika.metadata.TikaCoreProperties;
import eu.unicore.uas.util.LogUtil;
/**
* Indexes the metadata and provides a search interface.
*
*
* The location of the Lucene index is determined by the config parameter
* LuceneIndexer.LUCENE_INDEX_DIR. If the directory does not exist, it will be created;
* otherwise the existing index in that directory will be opened and extended.
*
*
*
* In the typical use case (see {@link LuceneMetadataManager}) the metadata files
* themselves (i.e. their content) are indexed, not the content of the files they describe.
*
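* A minimal usage sketch (storage id, base directory, field names and values are
* placeholders):
* <pre>{@code
* LuceneIndexer indexer = LuceneIndexer.get(storageUUID, indexBaseDir);
* Map<String, String> md = new HashMap<>();
* md.put("author", "Smith");
* indexer.createMetadata("file1.txt", md, "extracted text content");
* indexer.commit();
* List<SearchResult> hits = indexer.search("author:Smith", 10);
* }</pre>
*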
*
*
* TODO:
* check functionality
* how to handle a corrupt index exception - should we attempt recovery?
*
* @author w.noor
* @author jrybicki
* @author schuller
*
*/
public class LuceneIndexer {
private static final Logger LOG = LogUtil.getLogger(LogUtil.DATA, LuceneIndexer.class);
/**
* Content key
*/
public static final String CONTENT_KEY = "contents";
//default search key:
private static final String SEARCH_KEY = CONTENT_KEY;
public static final String RESOURCE_NAME_KEY = TikaCoreProperties.RESOURCE_NAME_KEY;
private final String dataDirectory;
private final Directory directory;
private final QueryParser parser = new QueryParser(SEARCH_KEY, new StandardAnalyzer());
private final static Map<String, LuceneIndexer> indexers = new HashMap<>();
private final IndexWriter indexWriter;
/**
* @param id - the id of the indexer, usually equal to the storage UUID
* @param indexDir - the base directory for the indexes
*/
public static synchronized LuceneIndexer get(String id, String indexDir){
LuceneIndexer indexer=indexers.get(id);
if(indexer==null){
indexer=new LuceneIndexer(indexDir + id);
indexers.put(id,indexer);
}
return indexer;
}
/**
* Initializes the indexer with the index located at the given path
*
* @param indexLocation
*/
public LuceneIndexer(String indexLocation) {
dataDirectory = indexLocation;
try {
directory = initializeDataDirectory();
indexWriter = initializeIndex();
} catch (IOException ex) {
throw new IllegalArgumentException(String.format("Unable to initialize Lucene index in: %s", dataDirectory), ex);
}
}
/**
* Adds a resource with metadata and contents to the index.
* Any existing metadata for the resource will be removed/overwritten.
*
* TODO: multiple files with the same resourceName?
*
* @param resourceName
* @param metadata
* @param contents
* @throws IOException
*/
public void createMetadata(String resourceName, Map<String, String> metadata, String contents) throws IOException {
Document document = createMetadataDocument(metadata, resourceName, contents);
indexWriter.deleteDocuments(new Term(RESOURCE_NAME_KEY, resourceName));
indexWriter.addDocument(document);
}
/**
* Removes the metadata for the given resource
* @param resourceName
* @throws IOException
*/
public void removeMetadata(final String resourceName) throws IOException {
indexWriter.deleteDocuments(new Term(RESOURCE_NAME_KEY, resourceName));
}
/**
* Update metadata for an existing resource
*
* The provided metadata is merged into the already indexed metadata; if the resource
* is not yet indexed, new metadata is created.
*
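* A sketch of the merge semantics (field names are placeholders; assumes
* LuceneMetadataManager.mergeMetadata() keeps existing keys and adds the new ones):
* <pre>{@code
* // already indexed for "file1.txt": author=Smith
* Map<String, String> update = new HashMap<>();
* update.put("project", "UNICORE");
* indexer.updateMetadata("file1.txt", update, null);
* // index now holds: author=Smith, project=UNICORE
* }</pre>
*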
* @param resourceName
* @param metadata
* @param contents
* @throws IOException
*/
public void updateMetadata(String resourceName, Map<String, String> metadata, String contents) throws IOException {
Document doc = getDocument(resourceName);
Map<String, String> mergedMetadata;
if (doc != null) {
Map<String, String> oldMetadata = extractMetadataFromDocument(doc);
mergedMetadata = LuceneMetadataManager.mergeMetadata(oldMetadata, metadata);
} else {
mergedMetadata = metadata;
}
createMetadata(resourceName, mergedMetadata, contents);
}
/**
* Moves the metadata from {@code source} to {@code target}. Any existing metadata for
* the target will be overwritten.
*
* Lucene does not provide an API method for updating a document in place: the update is
* performed by fetching a copy of the document, applying the changes, deleting the old
* copy and inserting the new one.
*
*
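* A sketch (paths are placeholders):
* <pre>{@code
* indexer.moveMetadata("docs/old-name.txt", "docs/new-name.txt");
* }</pre>
*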
* @param source
* @param target
* @throws IOException
*/
public void moveMetadata(String source, String target) throws IOException {
Document doc = getDocument(source);
if (doc == null || doc.getFields().size()==0) {
throw new IllegalArgumentException("No metadata indexed for " + source + ": unable to move");
}
removeMetadata(source);
createMetadata(target, extractMetadataFromDocument(doc), doc.get(CONTENT_KEY));
}
/**
* Simple single-attribute search
*
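* A sketch of a query (the field name "author" is illustrative; any indexed metadata
* key can be queried, and bare terms are matched against the content field):
* <pre>{@code
* List<SearchResult> hits = indexer.search("author:Smith AND simulation", 10);
* }</pre>
*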
* @param queryString a Lucene query string; terms without an explicit field prefix are
* matched against the default content field
* @param maximalHits the maximum number of hits to return
* @return list of search results
* @throws IOException
*/
public List<SearchResult> search(String queryString, int maximalHits) throws IOException {
// open a fresh reader so the search sees the latest committed state; close it when done
try (DirectoryReader reader = DirectoryReader.open(directory)) {
IndexSearcher searcher = new IndexSearcher(reader);
List<SearchResult> ret = new ArrayList<>();
Query query = parser.parse(queryString);
TopDocs results = searcher.search(query, maximalHits);
for (ScoreDoc scoreDoc : results.scoreDocs) {
SearchResult result = new SearchResult();
result.setResourceName(searcher.storedFields().document(scoreDoc.doc).get(RESOURCE_NAME_KEY));
ret.add(result);
}
return ret;
} catch (ParseException pe) {
throw new IOException(pe);
}
}
/**
* Advanced multi-attribute query
*
* Runs each query separately, collecting up to {@code numberOfrecords} hits per query,
* and aggregates the results.
*
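* A sketch (field names are placeholders):
* <pre>{@code
* String[] queries = { "author:Smith", "project:UNICORE" };
* List<SearchResult> hits = indexer.search(queries, 50);
* }</pre>
*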
* @param queryStrings
* @param numberOfrecords
* @return List of aggregated results
* @throws IOException
*/
public List<SearchResult> search(String[] queryStrings, int numberOfrecords) throws IOException {
List<SearchResult> lstMetadataFiles = new ArrayList<>();
for (String queryString : queryStrings) {
List<SearchResult> partialResult = search(queryString, numberOfrecords);
lstMetadataFiles.addAll(partialResult);
}
return lstMetadataFiles;
}
public void commit() throws IOException {
indexWriter.commit();
}
/**
* Commit changes. An explicit optimize step is no longer needed, as Lucene merges index
* segments automatically, so this is equivalent to {@link #commit()}.
*
* Should be called after adding a lot of documents.
*
*
* @throws java.io.IOException
*/
public void optimizeIndex() throws IOException {
commit();
}
/**
* delete the index
* @throws IOException
*/
public void deleteAll() throws IOException {
indexWriter.deleteAll();
indexWriter.commit();
}
/**
* Returns the Lucene document for the given resource, or {@code null} if the resource
* is not in the index.
*
* @param resourceName
* @return Document
* @throws IOException if the search index cannot be initialized
*/
protected Document getDocument(final String resourceName) throws IOException {
// open a fresh reader for the lookup and make sure it is closed again
try (DirectoryReader reader = DirectoryReader.open(directory)) {
IndexSearcher searcher = new IndexSearcher(reader);
Query query = new TermQuery(new Term(RESOURCE_NAME_KEY, resourceName));
//XXX: it might be possible that there are more documents for the resourceName... should we merge?
TopDocs result = searcher.search(query, 1);
//XXX: return empty or throw an exception?
if (result.scoreDocs.length == 0) {
return null;
}
return searcher.storedFields().document(result.scoreDocs[0].doc);
}
}
private Directory initializeDataDirectory() throws IOException {
File chkDir = new File(dataDirectory);
boolean mkdirs = true;
if (!chkDir.exists()) {
mkdirs = chkDir.mkdirs();
}
if (!mkdirs || !chkDir.isDirectory()) {
throw new IOException(String.format("Unable to create/find Lucene index directory: %s", dataDirectory));
}
return new NIOFSDirectory(chkDir.toPath());
}
private IndexWriter initializeIndex() throws IOException {
int attempts = 0;
while(attempts < 2) {
try {
return do_initializeIndex();
}catch(org.apache.lucene.index.IndexFormatTooOldException e) {
LOG.info(String.format("Old / unsupported Lucene index file detected in: %s, cleanup & retry ...", dataDirectory));
FileUtils.deleteQuietly(new File(dataDirectory));
attempts++;
}
}
throw new IOException(String.format("Could not create Lucene index in: %s", dataDirectory));
}
private IndexWriter do_initializeIndex() throws IOException {
// A server crash may leave a stale write.lock file behind. With the default native
// lock factory the underlying OS lock is released when the process dies, and indexers
// are per-storage (created via the synchronized get() factory), so it is safe to
// simply open a new IndexWriter here.
IndexWriterConfig cfg = new IndexWriterConfig(new StandardAnalyzer());
IndexWriter indexWriter = new IndexWriter(directory, cfg);
// create the index files right away, to avoid errors when a search is started
// before any documents have been added
indexWriter.commit();
LOG.info(String.format("Lucene index initialized in: %s", dataDirectory));
return indexWriter;
}
/**
* Extracts metadata from Lucene.Document
*
* @param document
* @return metadata map
*/
protected static Map<String, String> extractMetadataFromDocument(final Document document) {
if (document == null) {
throw new IllegalArgumentException("Document for metadata extraction cannot be null");
}
Map<String, String> ret = new HashMap<>();
for (Object object : document.getFields()) {
Field field = (Field) object;
String name = field.name();
String value = document.get(name);
ret.put(name, value);
}
return ret;
}
/**
* Creates Lucene.Document for provided resource, metadata and content
*
* @param metadata
* @param resource
* @param contents
* @return Document to be inserted in index.
*/
protected static Document createMetadataDocument(Map<String, String> metadata, String resource, String contents) {
if (metadata == null || metadata.isEmpty()) {
throw new IllegalArgumentException("Metadata cannot be null or empty");
}
if (resource == null || resource.trim().isEmpty()) {
throw new IllegalArgumentException("Resource name cannot be null or empty");
}
Document doc = new Document();
for (Map.Entry<String, String> entry : metadata.entrySet()) {
doc.add(new Field(entry.getKey(), entry.getValue(), TextField.TYPE_STORED));
}
//the resource name may already be present in the metadata map: replace it with a dedicated, non-tokenized stored field
doc.removeField(RESOURCE_NAME_KEY);
FieldType type = new FieldType();
type.setTokenized(false);
type.setStored(true);
type.setIndexOptions(IndexOptions.DOCS);
doc.add(new Field(RESOURCE_NAME_KEY, resource, type));
if (contents != null && !contents.trim().isEmpty()) {
doc.add(new Field(LuceneIndexer.CONTENT_KEY, contents, TextField.TYPE_NOT_STORED));
}
return doc;
}
}