com.meltmedia.cadmium.search.SearchContentPreprocessor Maven / Gradle / Ivy
The newest version!
/**
* Copyright 2012 meltmedia
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.meltmedia.cadmium.search;
import com.google.inject.Inject;
import com.meltmedia.cadmium.core.meta.ConfigProcessor;
import jodd.jerry.Jerry;
import jodd.lagarto.dom.Node;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.inject.Singleton;
import java.io.Closeable;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.Set;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
@Singleton
public class SearchContentPreprocessor implements ConfigProcessor, IndexSearcherProvider, Closeable {
/** Logger named after the runtime class of this instance. */
private final Logger log = LoggerFactory.getLogger(getClass());
/**
 * Search preprocessors that are notified when a new index is staged and when it
 * goes live. Injection is optional, so this may be left {@code null}; the methods
 * that iterate it check for that.
 */
@Inject(optional=true)
protected Set<SearchPreprocessor> searchPreprocessors;
/**
 * Accepts regular files whose path ends in ".htm" or ".html" (compared in lower case),
 * except files whose name is three digits, two digits plus "x", or one digit plus "xx"
 * followed by the extension (for example "404.html", "40x.htm" or "5xx.html").
 */
public static FileFilter HTML_FILE_FILTER = new FileFilter() {
  @Override
  public boolean accept(File candidate) {
    if (!candidate.isFile()) {
      return false;
    }
    String lowerPath = candidate.getPath().toLowerCase();
    if (!lowerPath.matches(".*\\.htm[l]?\\Z")) {
      return false;
    }
    String lowerName = candidate.getName().toLowerCase();
    boolean numericPageName = lowerName.matches("^((\\d{3})|(\\d{2}[x])|(\\d[x]{2}))\\.htm[l]?$");
    return !numericPageName;
  }
};
/** Accepts every path that is an existing directory. */
public static FileFilter DIR_FILTER = new FileFilter() {
  @Override
  public boolean accept(File candidate) {
    boolean directory = candidate.isDirectory();
    return directory;
  }
};
/** Accepts directories whose name does not end with "-INF". */
public static FileFilter NOT_INF_DIR_FILTER = new FileFilter() {
  @Override
  public boolean accept(File candidate) {
    if (!candidate.isDirectory()) {
      return false;
    }
    return !candidate.getName().endsWith("-INF");
  }
};
/**
 * Orders files by their last path element ({@link File#getName()}) using
 * {@link String#compareTo(String)}; parent directories are ignored.
 */
public static Comparator<File> FILE_NAME_COMPARATOR = new Comparator<File>() {
  @Override
  public int compare(File file1, File file2) {
    return file1.getName().compareTo(file2.getName());
  }
};
/**
 * A template class that scans the content directory, starting at the root, and
 * calls handleFile(File) for every file that matches the provided content filter.
 *
 * Directories are visited depth first: the matching files of a directory are handled
 * before its sub-directories, and sub-directories are visited in file name order.
 * Directories directly below the root whose name ends with "-INF" are skipped.
 *
 * @author Christian Trimble
 */
public static abstract class ContentScanTemplate
{
  private final FileFilter contentFilter;

  /**
   * @param contentFilter selects the files that are passed to handleFile(File).
   */
  public ContentScanTemplate(FileFilter contentFilter) {
    this.contentFilter = contentFilter;
  }

  /**
   * Walks the tree below the content root and hands every matching file to handleFile(File).
   *
   * @param contentRoot the directory to start scanning at.
   * @throws IOException if the content root cannot be listed (not a directory or an I/O error).
   * @throws Exception whatever handleFile(File) throws.
   */
  public void scan( File contentRoot ) throws Exception {
    // File.listFiles returns null instead of throwing; fail with a clear message for the root.
    File[] rootFiles = contentRoot.listFiles(contentFilter);
    if( rootFiles == null ) {
      throw new IOException("Unable to list content root ["+contentRoot+"]");
    }
    for( File htmlFile : rootFiles ) {
      handleFile(htmlFile);
    }

    // the frontier holds the directories that still have to be visited.
    LinkedList<File> frontier = new LinkedList<File>();

    // add the non "-INF" directories, in a predictable order.
    frontier.addAll(0, Arrays.asList(sortedDirectories(contentRoot, NOT_INF_DIR_FILTER)));

    while( !frontier.isEmpty() ) {
      File dir = frontier.removeFirst();

      // scan the html files in the directory; a directory that vanished or cannot be
      // read yields null and is skipped rather than aborting the whole scan.
      File[] files = dir.listFiles(contentFilter);
      if( files != null ) {
        for( File htmlFile : files ) {
          handleFile(htmlFile);
        }
      }

      // add the directories, in a predictable order, ahead of the remaining siblings.
      frontier.addAll(0, Arrays.asList(sortedDirectories(dir, DIR_FILTER)));
    }
  }

  /**
   * Lists the directories below parent that match the filter, sorted by file name.
   *
   * @param parent the directory to list.
   * @param dirFilter the filter selecting the directories to return.
   * @return the sorted directories, or an empty array when parent cannot be listed.
   */
  private static File[] sortedDirectories( File parent, FileFilter dirFilter ) {
    File[] dirs = parent.listFiles(dirFilter);
    if( dirs == null ) {
      return new File[0];
    }
    Arrays.sort(dirs, FILE_NAME_COMPARATOR);
    return dirs;
  }

  /**
   * Called once for every file accepted by the content filter.
   *
   * @param file the matching file.
   * @throws Exception if the file cannot be handled; this aborts the scan.
   */
  public abstract void handleFile( File file )
    throws Exception;
}
// Location of the Lucene index; processFromDirectory sets it to "lucene-index" under the meta directory.
private File indexDir;
// Parent of the meta directory; processFromDirectory sets it and scans it for content.
private File dataDir;
// Holder whose searcher is handed out by startSearch(); replaced in makeLive().
private SearchHolder liveSearch = null;
// Holder built by the latest processFromDirectory call, waiting to be promoted by makeLive().
private SearchHolder stagedSearch = null;
// Analyzer used for indexing and returned from getAnalyzer().
private static Analyzer analyzer = new CadmiumAnalyzer(Version.LUCENE_43);
// startSearch()/endSearch() use the read side of this lock, makeLive() the write side.
private final ReentrantReadWriteLock locker = new ReentrantReadWriteLock();
private final ReadLock readLock = locker.readLock();
private final WriteLock writeLock = locker.writeLock();
/**
 * Rebuilds the Lucene index under "lucene-index" in the meta directory from the content
 * in the meta directory's parent, stages the result and notifies the search preprocessors.
 * A previously staged index is closed once the new one has replaced it.
 *
 * @param metaDir path of the meta directory.
 * @throws Exception if the index cannot be written, committed or opened; in that case the
 *         previously staged index is left in place and the new directory handle is closed.
 */
@Override
public synchronized void processFromDirectory(String metaDir) throws Exception {
  SearchHolder newStagedSearcher = new SearchHolder();
  indexDir = new File(metaDir, "lucene-index");
  dataDir = new File(metaDir).getParentFile();
  boolean opened = false;
  try {
    newStagedSearcher.directory = new NIOFSDirectory(indexDir);
    IndexWriter iwriter = null;
    try {
      iwriter = new IndexWriter(newStagedSearcher.directory, new IndexWriterConfig(Version.LUCENE_43, analyzer).setRAMBufferSizeMB(5));
      iwriter.deleteAll();
      writeIndex(iwriter, dataDir);
      // commit explicitly so that a failing commit is reported instead of being
      // swallowed by closeQuietly below.
      iwriter.commit();
    }
    finally {
      IOUtils.closeQuietly(iwriter);
    }
    newStagedSearcher.indexReader = DirectoryReader.open(newStagedSearcher.directory);
    opened = true;
  }
  finally {
    if(!opened) {
      // do not leak the directory (and its file handles) when staging failed.
      newStagedSearcher.close();
    }
  }
  SearchHolder oldStage = stagedSearch;
  stagedSearch = newStagedSearcher;
  if(oldStage != null) {
    oldStage.close();
  }
  log.info("About to call processSearchPreprocessors()");
  processSearchPreprocessors(newStagedSearcher.indexReader, analyzer, "content");
}
/**
 * Scans contentDir for html files and adds one document per indexable page to the
 * index writer. Each document stores the fields "title", "content" and "path", where
 * the path is the file path with the first occurrence of the data directory removed.
 * A page that fails to parse or index is logged and skipped; the scan continues.
 *
 * @param indexWriter the writer that receives the documents.
 * @param contentDir the directory to scan.
 * @throws Exception if the scan itself fails.
 */
void writeIndex( final IndexWriter indexWriter, File contentDir ) throws Exception {
  new ContentScanTemplate(HTML_FILE_FILTER) {
    private Jerry.JerryParser jerryParser = null;

    /** Returns the parser shared by all pages, configuring it on first use. */
    private Jerry.JerryParser parser() {
      if(jerryParser == null) {
        jerryParser = Jerry.jerry().enableHtmlMode();
        jerryParser.getDOMBuilder().setCaseSensitive(false);
        jerryParser.getDOMBuilder().setParseSpecialTagsAsCdata(true);
        jerryParser.getDOMBuilder().setSelfCloseVoidTags(false);
        jerryParser.getDOMBuilder().setConditionalCommentExpression(null);
        jerryParser.getDOMBuilder().setEnableConditionalComments(false);
        jerryParser.getDOMBuilder().setImpliedEndTags(false);
        jerryParser.getDOMBuilder().setIgnoreComments(true);
      }
      return jerryParser;
    }

    @Override
    public void handleFile(File file) throws Exception {
      try {
        Jerry.JerryParser parser = parser();
        String htmlContent = FileUtils.readFileToString(file, "UTF-8");
        Jerry jerry = parser.parse(htmlContent);

        // if we should not index this file, move on.
        if(!shouldIndex(jerry)) return;

        // read the title before the head element is removed below.
        String title = jerry.$("html > head > title").text();

        Jerry removals = jerry.$("title,head,script,[cadmium=\"no-index\"]");
        if(removals.size() > 0) {
          log.debug("Removing {} element[s]", removals.length());
          removals.remove();
        } else {
          log.debug("No elements to remove");
        }

        String textContent = jerry.$("body").text();

        // quote the data directory so that it is matched literally; unquoted, characters
        // such as '\' or '(' in the path are interpreted as regular expression syntax.
        String dataDirPattern = java.util.regex.Pattern.quote(dataDir.getPath());
        String relativePath = file.getPath().replaceFirst(dataDirPattern, "");

        Document doc = new Document();
        doc.add(new TextField("title", title, Field.Store.YES));
        doc.add(new TextField("content", textContent, Field.Store.YES));
        doc.add(new TextField("path", relativePath, Field.Store.YES));
        indexWriter.addDocument(doc);
      } catch(Throwable t) {
        // best effort: a single broken page must not abort indexing of the whole site.
        log.warn("Failed to index page ["+file+"]", t);
      }
    }
  }.scan(contentDir);
}
/**
 * Promotes the staged index to the live index while holding the write lock.
 * Nothing happens when no complete staged index exists. Otherwise the search
 * preprocessors are made live first, then the staged holder replaces the live one
 * and the previous live holder is closed.
 */
@Override
public synchronized void makeLive() {
  log.info("About to call lock on writeLock");
  writeLock.lock();
  try {
    if( this.stagedSearch != null && this.stagedSearch.directory != null && this.stagedSearch.indexReader != null ) {
      log.info("About to call makeLiveProcessSearchPreprocessors()");
      makeLiveProcessSearchPreprocessors();
      SearchHolder oldLive = liveSearch;
      liveSearch = stagedSearch;
      IOUtils.closeQuietly(oldLive);
      stagedSearch = null;
    }
  }
  finally {
    // always release the lock, otherwise a failure here would block every later search.
    writeLock.unlock();
  }
}
/**
 * Quietly closes the live and the staged search holders. Overrides
 * {@link Object#finalize()} with public visibility; close() also calls it directly.
 */
@Override
public void finalize() {
  IOUtils.closeQuietly(liveSearch);
  IOUtils.closeQuietly(stagedSearch);
}
/**
 * Acquires the read lock and returns the searcher for the live index, creating it on
 * first use. The read lock is still held when this method returns, also when the
 * result is null; endSearch() releases it.
 *
 * @return the searcher for the live index, or null when no index is live yet.
 */
@Override
public IndexSearcher startSearch() {
  readLock.lock();
  SearchHolder current = this.liveSearch;
  if(current == null) {
    return null;
  }
  if(current.indexSearcher == null) {
    current.indexSearcher = new IndexSearcher(current.indexReader);
  }
  return current.indexSearcher;
}
/**
 * Releases the read lock that startSearch() acquired.
 */
@Override
public void endSearch() {
readLock.unlock();
}
/**
 * @return the analyzer shared by this class; the same instance is used when writing the index.
 */
@Override
public Analyzer getAnalyzer() {
return analyzer;
}
/**
 * @return the index directory set by the last processFromDirectory call, or null before the first call.
 */
public File getIndexDir() {
return indexDir;
}
/**
 * @return the data directory set by the last processFromDirectory call, or null before the first call.
 */
public File getDataDir() {
return dataDir;
}
private class SearchHolder implements Closeable {
private Directory directory = null;
private IndexReader indexReader = null;
private IndexSearcher indexSearcher = null;
public void close() {
IOUtils.closeQuietly(indexReader);
IOUtils.closeQuietly(directory);
}
public void finalize() {
close();
}
}
/**
 * Quietly closes the live and the staged search holders.
 *
 * @throws IOException declared by {@link Closeable}; failures while closing are suppressed.
 */
@Override
public void close() throws IOException {
  // release the resources directly instead of invoking the finalize() hook by hand.
  IOUtils.closeQuietly(liveSearch);
  IOUtils.closeQuietly(stagedSearch);
}
/**
 * Returns true if an html file should be indexed, false otherwise. Currently, this only tests for the existence of a robots meta tag with a
 * content value containing "noindex". Both the tag name and the content value are compared case-insensitively.
 *
 * @param jerry the Jerry context for the html page to test.
 * @return false if a "html > head > meta" element named "robots" has a content value containing "noindex", true otherwise.
 */
private static boolean shouldIndex(Jerry jerry) {
  for(Node metaTag : jerry.$("html > head > meta").get()) {
    String name = metaTag.getAttribute("name");
    String contentValue = metaTag.getAttribute("content");
    // a missing or valueless attribute yields null; such a tag cannot forbid indexing.
    if(name == null || contentValue == null) {
      continue;
    }
    // Locale.ROOT keeps the comparison independent of the default locale
    // (e.g. a Turkish locale lower-cases "I" to a dotless "ı").
    boolean robotsTag = "robots".equals(name.toLowerCase(java.util.Locale.ROOT));
    if(robotsTag && contentValue.toLowerCase(java.util.Locale.ROOT).contains("noindex")) {
      return false;
    }
  }
  return true;
}
/**
 * Hands the freshly staged index to every registered search preprocessor. A failing
 * preprocessor is logged and does not stop the remaining ones.
 *
 * @param reader the reader opened on the staged index.
 * @param analyzer the analyzer the index was written with.
 * @param field the name of the index field the preprocessors should work on.
 */
protected void processSearchPreprocessors(IndexReader reader, Analyzer analyzer, String field) {
  log.info("processing search preprocessors.");
  log.info("preprocessors to process: {}", searchPreprocessors);
  if(searchPreprocessors != null) {
    for(SearchPreprocessor p : searchPreprocessors) {
      try {
        log.info("Processing: {}", p);
        p.process(reader, analyzer, field);
      }
      catch (Exception e) {
        // keep the cause in the log; without it the failure cannot be diagnosed.
        log.warn("Problem setting up search suggester preprocessor for field: " + field, e);
      }
    }
  }
}
/**
 * Calls makeLive() on every registered search preprocessor. A failing preprocessor
 * is logged and does not stop the remaining ones.
 */
protected void makeLiveProcessSearchPreprocessors() {
  log.info("Making live search preprocessors.");
  log.info("preprocessors to process: {}", searchPreprocessors);
  if(searchPreprocessors != null) {
    for(SearchPreprocessor p : searchPreprocessors) {
      try {
        p.makeLive();
      }
      catch (Exception e) {
        // keep the cause in the log; without it the failure cannot be diagnosed.
        log.warn("Problem making live the search preprocessor", e);
      }
    }
  }
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy