/**
 *    Copyright 2012 meltmedia
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 */
package com.meltmedia.cadmium.search;

import com.google.inject.Inject;
import com.meltmedia.cadmium.core.meta.ConfigProcessor;
import jodd.jerry.Jerry;
import jodd.lagarto.dom.Node;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.inject.Singleton;
import java.io.Closeable;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.Set;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;

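/**
 * Builds and serves a Lucene index over the site's HTML content.  {@link #processFromDirectory(String)}
 * stages a new index from the content directory, and {@link #makeLive()} swaps the staged index in as
 * the live index that {@link #startSearch()} searches against.
 */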
@Singleton
public class SearchContentPreprocessor implements ConfigProcessor, IndexSearcherProvider, Closeable {
  private final Logger log = LoggerFactory.getLogger(getClass());
  
  @Inject(optional=true)
  protected Set<SearchPreprocessor> searchPreprocessors;
  
  
  public static FileFilter HTML_FILE_FILTER = new FileFilter() {
    @Override
    public boolean accept(File pathname) {
      return pathname.isFile() 
          && pathname.getPath().toLowerCase().matches(".*\\.htm[l]?\\Z") 
          && !pathname.getName().toLowerCase().matches("^((\\d{3})|(\\d{2}[x])|(\\d[x]{2}))\\.htm[l]?$");
    }
  };
  
  public static FileFilter DIR_FILTER = new FileFilter() {
    @Override
    public boolean accept(File pathname) {
      return pathname.isDirectory();
    }
  };
  
  public static FileFilter NOT_INF_DIR_FILTER = new FileFilter() {
    @Override
    public boolean accept(File pathname) {
      return pathname.isDirectory() && !pathname.getName().endsWith("-INF");
    }
  };
  
  public static Comparator<File> FILE_NAME_COMPARATOR = new Comparator<File>() {
    @Override
    public int compare(File file1, File file2) {
      return file1.getName().compareTo(file2.getName());
    }
  };
  
  /**
   * A template class that scans the content directory, starting at the root, and
   * calls handleFile(File) for every file that matches the provided content filter.
   * 
   * @author Christian Trimble
   */
  public static abstract class ContentScanTemplate
  {
    private FileFilter contentFilter;

    public ContentScanTemplate(FileFilter contentFilter) {
      this.contentFilter = contentFilter;
    }
    
    public void scan( File contentRoot ) throws Exception {
      // the frontier of directories left to scan.
      LinkedList<File> frontier = new LinkedList<File>();
      
      // scan the content root dir for html files.
      for( File htmlFile : contentRoot.listFiles(contentFilter)) {
        handleFile(htmlFile);
      }
      
      // add the non "-INF" directories, in a predictable order.
      frontier.subList(0, 0).addAll(Arrays.asList(sort(contentRoot.listFiles(NOT_INF_DIR_FILTER), FILE_NAME_COMPARATOR)));
      
      while( !frontier.isEmpty() ) {
        File dir = frontier.removeFirst();
        
        // scan the html files in the directory.
        for( File htmlFile : dir.listFiles(contentFilter)) {
          handleFile(htmlFile);
        }
   
        // add the directories, in a predictable order.
        frontier.subList(0, 0).addAll(Arrays.asList(sort(dir.listFiles(DIR_FILTER), FILE_NAME_COMPARATOR)));
      }
    }
    
    /**
     * A call to Arrays.sort(array, comparator) that returns the array argument after the sort.
     * 
     * @param array the array to sort.
     * @param comparator the comparator to sort with.
     * @return the array argument.
     */
    private static <T> T[] sort( T[] array, Comparator<T> comparator ) {
      Arrays.sort(array, comparator);
      return array;
    }
    
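    /**
     * Called once for each file accepted by the content filter.
     *
     * @param file the matched content file.
     * @throws Exception if the file cannot be processed.
     */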
    public abstract void handleFile( File file )
      throws Exception;
  }
  
  private File indexDir;
  private File dataDir;
  private SearchHolder liveSearch = null;
  private SearchHolder stagedSearch = null;
  private static Analyzer analyzer = new CadmiumAnalyzer(Version.LUCENE_43);
  private final ReentrantReadWriteLock locker = new ReentrantReadWriteLock();
  private final ReadLock readLock = locker.readLock();
  private final WriteLock writeLock = locker.writeLock();
  

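  /**
   * Builds a new staged Lucene index in the "lucene-index" directory under the given meta directory,
   * indexing the HTML content found in the meta directory's parent, and then runs any bound search
   * preprocessors against the new index.  The staged index does not serve searches until
   * {@link #makeLive()} is called.
   */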
  @Override
  public synchronized void processFromDirectory(String metaDir) throws Exception {
    SearchHolder newStagedSearcher = new SearchHolder();
    indexDir = new File(metaDir, "lucene-index");
    dataDir = new File(metaDir).getParentFile();
    newStagedSearcher.directory = new NIOFSDirectory(indexDir);
    IndexWriter iwriter = null;
    try {
      iwriter = new IndexWriter(newStagedSearcher.directory, new IndexWriterConfig(Version.LUCENE_43, analyzer).setRAMBufferSizeMB(5));
      iwriter.deleteAll();
      writeIndex(iwriter, dataDir);
    }
    finally {
      IOUtils.closeQuietly(iwriter);
      iwriter = null;
    }
    newStagedSearcher.indexReader = DirectoryReader.open(newStagedSearcher.directory);
    SearchHolder oldStage = stagedSearch;
    stagedSearch = newStagedSearcher;
    if(oldStage != null) {
      oldStage.close();
    }
    log.info("About to call processSearchPreprocessors()");
    processSearchPreprocessors(newStagedSearcher.indexReader, analyzer, "content");
  }
  
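  /**
   * Walks the content directory with a {@link ContentScanTemplate}, parses each HTML file with Jerry,
   * strips titles, heads, scripts, and elements marked {@code cadmium="no-index"}, and adds a document
   * with "title", "content", and "path" fields to the given index writer.  Pages with a robots
   * "noindex" meta tag are skipped.
   */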
  void writeIndex( final IndexWriter indexWriter, File contentDir ) throws Exception {
    new ContentScanTemplate(HTML_FILE_FILTER) {

      private Jerry.JerryParser jerryParser = null;

      @Override
      public void handleFile(File file) throws Exception {
        try {
          if(jerryParser == null) {
            jerryParser = Jerry.jerry().enableHtmlMode();
            jerryParser.getDOMBuilder().setCaseSensitive(false);
            jerryParser.getDOMBuilder().setParseSpecialTagsAsCdata(true);
            jerryParser.getDOMBuilder().setSelfCloseVoidTags(false);
            jerryParser.getDOMBuilder().setConditionalCommentExpression(null);
            jerryParser.getDOMBuilder().setEnableConditionalComments(false);
            jerryParser.getDOMBuilder().setImpliedEndTags(false);
            jerryParser.getDOMBuilder().setIgnoreComments(true);
          }
          String htmlContent = FileUtils.readFileToString(file, "UTF-8");
          Jerry jerry = jerryParser.parse(htmlContent);
          
          // if we should not index this file, move on.
          if(!shouldIndex(jerry)) return;

          String title = jerry.$("html > head > title").text();

          Jerry removals = jerry.$("title,head,script,[cadmium=\"no-index\"]");
          if(removals.size() > 0) {
            log.debug("Removing {} element[s]", removals.length());
            removals.remove();
          } else {
            log.debug("No elements to remove");
          }

          String textContent = jerry.$("body").text();

          Document doc = new Document();
          doc.add(new TextField("title", title, Field.Store.YES));
          doc.add(new TextField("content", textContent, Field.Store.YES));
          doc.add(new TextField("path", file.getPath().replaceFirst(dataDir.getPath(), ""), Field.Store.YES));
          indexWriter.addDocument(doc);
        } catch(Throwable t) {
          log.warn("Failed to index page ["+file+"]", t);
        }

      }
    }.scan(contentDir); 
    
  }

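  /**
   * Promotes the staged index to live and closes the previous live index.  The write lock is held for
   * the swap, so in-flight searches (which hold the read lock) finish against a consistent index before
   * the swap takes effect.
   */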
  @Override
  public synchronized void makeLive() {
  	log.info("About to call lock on writeLock");
    writeLock.lock();
    if( this.stagedSearch != null && this.stagedSearch.directory != null && this.stagedSearch.indexReader != null ) {
    	log.info("About to call makeLiveProcessSearchPreprocessors()");
    	makeLiveProcessSearchPreprocessors();
    	SearchHolder oldLive = liveSearch;
      liveSearch = stagedSearch;
      IOUtils.closeQuietly(oldLive);
      stagedSearch = null;
    }
    writeLock.unlock();
  }
  
  public void finalize() {
    IOUtils.closeQuietly(liveSearch);
    IOUtils.closeQuietly(stagedSearch);
  }

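  /**
   * Acquires the read lock and returns an {@link IndexSearcher} over the live index, or {@code null}
   * if no index has been made live yet.  Callers must release the lock with {@link #endSearch()}.
   * An illustrative call sequence (the {@code provider} variable is hypothetical):
   *
   * <pre>{@code
   * IndexSearcher searcher = provider.startSearch();
   * try {
   *   if (searcher != null) {
   *     // run queries against searcher here
   *   }
   * } finally {
   *   provider.endSearch();
   * }
   * }</pre>
   */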
  @Override
  public IndexSearcher startSearch() {
    readLock.lock();
    if(this.liveSearch != null) {
      if(this.liveSearch.indexSearcher == null) {
        IndexSearcher searcher = new IndexSearcher(this.liveSearch.indexReader);
        this.liveSearch.indexSearcher = searcher;
      }
      return this.liveSearch.indexSearcher;
    }
    return null;
  }

  @Override
  public void endSearch() {
    readLock.unlock();
  }

  @Override
  public Analyzer getAnalyzer() {
    return analyzer;
  }
  
  public File getIndexDir() {
    return indexDir;
  }

  public File getDataDir() {
    return dataDir;
  }

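  /**
   * Bundles the Lucene {@link Directory}, {@link IndexReader}, and lazily created {@link IndexSearcher}
   * for one generation of the index, so the whole generation can be swapped and closed together.
   */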
  private class SearchHolder implements Closeable {
    private Directory directory = null;
    private IndexReader indexReader = null;
    private IndexSearcher indexSearcher = null;
    public void close() {
      IOUtils.closeQuietly(indexReader);
      IOUtils.closeQuietly(directory);
    }
    public void finalize() {
      close();
    }
  }

  @Override
  public void close() throws IOException {
    finalize();
  }
  
  /**
   * Returns true if an HTML file should be indexed, false otherwise.  Currently, this only tests for
   * the existence of a robots meta tag with a content value containing "noindex".
   * 
   * @param jerry the Jerry context for the HTML page to test.
   * @return true if the page should be indexed; false otherwise.
   */
  private static boolean shouldIndex(Jerry jerry) {
    Jerry metaTags = jerry.$("html > head > meta");
    if(metaTags.get().length > 0) {
      for(Node $this : metaTags.get()){
          if($this.hasAttribute("name") && "robots".equals($this.getAttribute("name").toLowerCase()) && $this.getAttribute("content") != null) {
            String contentValue = $this.getAttribute("content");
            if(contentValue == null || contentValue.toLowerCase().contains("noindex")) {
              return false;
            }
          }
      }
    }
    return true;
  }
  
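  /**
   * Runs each injected {@link SearchPreprocessor} against the freshly built index.  Failures are
   * logged and do not prevent the remaining preprocessors from running.
   */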
  protected void processSearchPreprocessors(IndexReader reader, Analyzer analyzer, String field) {
    log.info("processing search preprocessors.");
    log.info("preprocessors to process: {}", searchPreprocessors);
    if(searchPreprocessors != null) {
      for(SearchPreprocessor p : searchPreprocessors) {
        try {
          log.info("Processing: {}", p);
          p.process(reader, analyzer, field);
        }
        catch (Exception e) {
          log.warn("Problem setting up search suggester preprocessor for field: {}", field, e);
        }
      }
    }
  }
  
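  /**
   * Asks each injected {@link SearchPreprocessor} to promote its staged state to live, mirroring
   * {@link #makeLive()}.  Failures are logged and do not prevent the remaining preprocessors from
   * running.
   */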
  protected void makeLiveProcessSearchPreprocessors() {
    log.info("Making live search preprocessors.");
    log.info("preprocessors to process: {}", searchPreprocessors);
    if(searchPreprocessors != null) {
      for(SearchPreprocessor p : searchPreprocessors) {
        try {
          p.makeLive();
        }
        catch (Exception e) {
          log.warn("Problem making live the search preprocessor", e);
        }
      }
    }
  }

}