eu.unicore.uas.metadata.LuceneIndexer

package eu.unicore.uas.metadata;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.tika.metadata.TikaCoreProperties;

import eu.unicore.uas.util.LogUtil;

/**
 * Indexes the metadata and provides a search interface.
 *
 * <p>
 * The location of the Lucene index is determined by the config parameter
 * LuceneIndexer.LUCENE_INDEX_DIR. If the directory does not exist, it will be created;
 * otherwise the existing index in the directory will be opened and extended.
 *
 * <p>
 * In the typical use case @see{LuceneMetadataManager} the metadata files themselves
 * (i.e. their content) are indexed, not the content of the files they describe.
 *
 * TODO:
 *  - check functionality
 *  - work with corrupt index exception? should we recover?
 *
 * @author w.noor
 * @author jrybicki
 * @author schuller
 */
public class LuceneIndexer {

    private static final Logger LOG = LogUtil.getLogger(LogUtil.DATA, LuceneIndexer.class);

    /**
     * Content key
     */
    public static final String CONTENT_KEY = "contents";

    // default search key:
    private static final String SEARCH_KEY = CONTENT_KEY;

    public static final String RESOURCE_NAME_KEY = TikaCoreProperties.RESOURCE_NAME_KEY;

    private final String dataDirectory;
    private final Directory directory;
    private final QueryParser parser = new QueryParser(SEARCH_KEY, new StandardAnalyzer());

    private static final Map<String, LuceneIndexer> indexers = new HashMap<>();

    private final IndexWriter indexWriter;

    /**
     * @param id - the id of the indexer, usually equal to the storage UUID
     * @param indexDir - the base directory for the indexes
     */
    public static synchronized LuceneIndexer get(String id, String indexDir) {
        LuceneIndexer indexer = indexers.get(id);
        if (indexer == null) {
            indexer = new LuceneIndexer(indexDir + id);
            indexers.put(id, indexer);
        }
        return indexer;
    }

    /**
     * Initializes the indexer with an index at the given path
     *
     * @param indexLocation
     */
    public LuceneIndexer(String indexLocation) {
        dataDirectory = indexLocation;
        try {
            directory = initalizeDataDirectory();
            indexWriter = initializeIndex();
        } catch (IOException ex) {
            throw new IllegalArgumentException(String.format("Unable to initialize Lucene index in: %s", dataDirectory), ex);
        }
    }

    /**
     * Adds a resource with metadata and contents to the index.
     * Old metadata for the resource (if any) will be removed/overwritten.
     *
     * TODO: multiple files with the same resourceName?
     *
     * @param resourceName
     * @param metadata
     * @param contents
     * @throws IOException
     */
    public void createMetadata(String resourceName, Map<String, String> metadata, String contents) throws IOException {
        Document document = createMetadataDocument(metadata, resourceName, contents);
        indexWriter.deleteDocuments(new Term(RESOURCE_NAME_KEY, resourceName));
        indexWriter.addDocument(document);
    }

    /**
     * Removes the metadata for the given resource
     *
     * @param resourceName
     * @throws IOException
     */
    public void removeMetadata(final String resourceName) throws IOException {
        indexWriter.deleteDocuments(new Term(RESOURCE_NAME_KEY, resourceName));
    }

    /**
     * Updates the metadata for an existing resource.
     *
     * Adds the provided metadata to the existing metadata, or creates new metadata if the
     * resource is not indexed yet.
     *
     * @param metadata
     * @param resourceName
     * @param contents
     * @throws IOException
     */
    public void updateMetadata(String resourceName, Map<String, String> metadata, String contents) throws IOException {
        Document doc = getDocument(resourceName);
        Map<String, String> mergeMetadata;
        if (doc != null) {
            Map<String, String> oldMetadata = extractMetadataFromDocument(doc);
            mergeMetadata = LuceneMetadataManager.mergeMetadata(oldMetadata, metadata);
        } else {
            mergeMetadata = metadata;
        }
        createMetadata(resourceName, mergeMetadata, contents);
    }

    /**
     * Moves the metadata from @code{source} to @code{target}. The target will be overwritten.
     *
     * <p>
     * Lucene does not update documents in place: the update is done by getting a copy of the
     * document, applying the changes, deleting the old copy and inserting the new one.
     *
     * @param source
     * @param target
     * @throws IOException
     */
    public void moveMetadata(String source, String target) throws IOException {
        Document doc = getDocument(source);
        if (doc == null || doc.getFields().size() == 0) {
            throw new IllegalArgumentException("No metadata indexed for " + source + ": unable to move");
        }
        removeMetadata(source);
        createMetadata(target, extractMetadataFromDocument(doc), doc.get(CONTENT_KEY));
    }

    /**
     * Simple single-attribute search
     *
     * @param queryString
     * @param maximalHits
     * @return list of search results
     * @throws IOException
     */
    public List<SearchResult> search(String queryString, int maximalHits) throws IOException {
        IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(directory));
        try {
            List<SearchResult> ret = new ArrayList<>();
            Query query = parser.parse(queryString);
            TopDocs results = searcher.search(query, maximalHits);
            for (ScoreDoc scoreDoc : results.scoreDocs) {
                SearchResult result = new SearchResult();
                result.setResourceName(searcher.storedFields().document(scoreDoc.doc).get(RESOURCE_NAME_KEY));
                ret.add(result);
            }
            return ret;
        } catch (ParseException pe) {
            throw new IOException(pe);
        }
    }

    /**
     * Advanced multi-attribute query.
     *
     * Searches for @code{numberOfrecords} documents fulfilling each of the queries.
     *
     * @param queryStrings
     * @param numberOfrecords
     * @return list of aggregated results
     * @throws IOException
     */
    public List<SearchResult> search(String[] queryStrings, int numberOfrecords) throws IOException {
        List<SearchResult> lstMetadataFiles = new ArrayList<>();
        for (String queryString : queryStrings) {
            List<SearchResult> partialResult = search(queryString, numberOfrecords);
            lstMetadataFiles.addAll(partialResult);
        }
        return lstMetadataFiles;
    }

    public void commit() throws IOException {
        indexWriter.commit();
    }

    /**
     * Commits changes and optimizes the index.
     *
     * <p>
     * Should be called after adding a lot of documents.
     *
     * @throws java.io.IOException
     */
    public void optimizeIndex() throws IOException {
        commit();
    }

    /**
     * Deletes the index
     *
     * @throws IOException
     */
    public void deleteAll() throws IOException {
        indexWriter.deleteAll();
        indexWriter.commit();
    }

    /**
     * Returns the Lucene document for the given resource, or @code{null} if the resource
     * is not in the index.
     *
     * @param resourceName
     * @return Document
     * @throws IOException if the search index cannot be initialized
     */
    protected Document getDocument(final String resourceName) throws IOException {
        IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(directory));
        Query query = new TermQuery(new Term(RESOURCE_NAME_KEY, resourceName));
        //XXX: it might be possible that there are more documents for the resourceName... should we merge?
        TopDocs result = searcher.search(query, 1);
        //XXX: return empty or throw an exception?
        if (result.scoreDocs.length == 0) {
            return null;
        }
        return searcher.storedFields().document(result.scoreDocs[0].doc);
    }

    private Directory initalizeDataDirectory() throws IOException {
        File chkDir = new File(dataDirectory);
        boolean mkdirs = true;
        if (!chkDir.exists()) {
            mkdirs = chkDir.mkdirs();
        }
        if (!mkdirs || !chkDir.isDirectory()) {
            throw new IOException(String.format("Unable to create/find Lucene index directory: %s", dataDirectory));
        }
        return new NIOFSDirectory(new File(dataDirectory).toPath());
    }

    private IndexWriter initializeIndex() throws IOException {
        int attempts = 0;
        while (attempts < 2) {
            try {
                return do_initializeIndex();
            } catch (org.apache.lucene.index.IndexFormatTooOldException e) {
                LOG.info(String.format("Old / unsupported Lucene index file detected in: %s, cleanup & retry ...", dataDirectory));
                FileUtils.deleteQuietly(new File(dataDirectory));
                attempts++;
            }
        }
        throw new IOException(String.format("Could not create Lucene index in: %s", dataDirectory));
    }

    private IndexWriter do_initializeIndex() throws IOException {
        // Sometimes a server crash may leave the lock file, so check and unlock if necessary.
        // This is only called when creating the LuceneIndexer, and indexers are per-storage,
        // so it should be safe to forcibly unlock.
        IndexWriterConfig cfg = new IndexWriterConfig(new StandardAnalyzer());
        IndexWriter indexWriter = new IndexWriter(directory, cfg);
        // create index files (to avoid errors when searching is started before files are added)
        indexWriter.commit();
        LOG.info(String.format("Lucene index initialized in: %s", dataDirectory));
        return indexWriter;
    }

    /**
     * Extracts metadata from a Lucene Document
     *
     * @param document
     * @return metadata map
     */
    protected static Map<String, String> extractMetadataFromDocument(final Document document) {
        if (document == null) {
            throw new IllegalArgumentException("Document for metadata extraction cannot be null");
        }
        Map<String, String> ret = new HashMap<>();
        for (Object object : document.getFields()) {
            Field field = (Field) object;
            String name = field.name();
            String value = document.get(name);
            ret.put(name, value);
        }
        return ret;
    }

    /**
     * Creates a Lucene Document for the provided resource, metadata and content
     *
     * @param metadata
     * @param resource
     * @param contents
     * @return Document to be inserted into the index
     */
    protected static Document createMetadataDocument(Map<String, String> metadata, String resource, String contents) {
        if (metadata == null || metadata.isEmpty()) {
            throw new IllegalArgumentException("Metadata cannot be null or empty");
        }
        if (resource == null || resource.trim().isEmpty()) {
            throw new IllegalArgumentException("Resource name cannot be null or empty");
        }
        Document doc = new Document();
        for (Map.Entry<String, String> entry : metadata.entrySet()) {
            doc.add(new Field(entry.getKey(), entry.getValue(), TextField.TYPE_STORED));
        }
        // it might already be in the metadata: update
        doc.removeField(RESOURCE_NAME_KEY);
        FieldType type = new FieldType();
        type.setTokenized(false);
        type.setStored(true);
        type.setIndexOptions(IndexOptions.DOCS);
        doc.add(new Field(RESOURCE_NAME_KEY, resource, type));
        if (contents != null && !contents.trim().isEmpty()) {
            doc.add(new Field(LuceneIndexer.CONTENT_KEY, contents, TextField.TYPE_NOT_STORED));
        }
        return doc;
    }
}
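For orientation, here is a minimal usage sketch of the class above: obtain a per-storage indexer, index metadata for a resource, commit, and query it. The storage id, index directory and metadata keys are made-up example values, and the sketch assumes SearchResult exposes a getResourceName() accessor matching the setter used in search(); it is not part of the shipped code.

package eu.unicore.uas.metadata;

import java.util.HashMap;
import java.util.Map;

public class LuceneIndexerUsageSketch {

    public static void main(String[] args) throws Exception {
        // one indexer per storage, kept under <base dir>/<storage id> (example values)
        LuceneIndexer indexer = LuceneIndexer.get("storage-1234", "/tmp/metadata-index/");

        // index a metadata document for a resource; an existing entry is overwritten
        Map<String, String> metadata = new HashMap<>();
        metadata.put("author", "jrybicki");
        metadata.put("project", "unicore");
        indexer.createMetadata("/files/report.pdf", metadata, "extracted text of the metadata file");

        // make the changes visible to searchers
        indexer.commit();

        // the default query field is "contents"; metadata keys can be queried explicitly
        for (SearchResult hit : indexer.search("author:jrybicki", 10)) {
            System.out.println(hit.getResourceName()); // assumed accessor, see note above
        }
    }
}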


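The delete-then-reinsert pattern noted in the moveMetadata() Javadoc is plain Lucene behaviour: stored documents are never modified in place. A rough standalone sketch of the same pattern against a bare IndexWriter is shown below; the index path and field names are illustrative only. Lucene's IndexWriter.updateDocument(Term, ...) bundles the same delete-and-add into a single call, whereas LuceneIndexer performs the two steps explicitly.

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.NIOFSDirectory;

public class UpdatePatternSketch {

    public static void main(String[] args) throws Exception {
        try (IndexWriter writer = new IndexWriter(
                new NIOFSDirectory(Paths.get("/tmp/sketch-index")),      // illustrative path
                new IndexWriterConfig(new StandardAnalyzer()))) {

            // the replacement document, keyed by a stored, untokenized resource name
            Document doc = new Document();
            doc.add(new StringField("resourceName", "/files/report.pdf", Field.Store.YES));
            doc.add(new TextField("contents", "updated extracted text", Field.Store.NO));

            // explicit two-step variant, as used by LuceneIndexer.createMetadata():
            writer.deleteDocuments(new Term("resourceName", "/files/report.pdf"));
            writer.addDocument(doc);

            // equivalent single call:
            // writer.updateDocument(new Term("resourceName", "/files/report.pdf"), doc);

            writer.commit();
        }
    }
}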

