/*
 * io.airlift.http.server.search.SearchIndex (Maven / Gradle / Ivy)
 *
 * Navigation text carried over from the artifact listing page:
 *   Go to download
 *   Show more of this group / Show more artifacts with this name
 *   Show all versions of webserver / Show documentation
 *   The newest version!
 */
package io.airlift.http.server.search;

import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

import java.io.IOException;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.atomic.AtomicBoolean;

import javax.annotation.Nullable;

import org.elasticsearch.action.admin.indices.refresh.RefreshRequest;
import org.elasticsearch.action.delete.DeleteResponse;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.sonatype.goodies.lifecycle.LifecycleSupport;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Charsets;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

import io.takari.watcher.DirectoryChangeListener;
import io.takari.watcher.DirectoryWatcher;

/**
 * Search index of HTML pages found below a watched directory.
 *
 * On start the index is created, every indexable file below the watch directory is indexed once,
 * and a {@link DirectoryWatcher} is started on a dedicated thread to keep the index in sync with
 * subsequent file creations, modifications and deletions.
 *
 * Instances are created via {@link Builder}.
 *
 * @since 0.0.6
 */
public class SearchIndex extends LifecycleSupport {
  private static final String PAGE_TYPE = "page";

  private static final String F_PATH = "path";

  private static final String F_TITLE = "title";

  private static final String F_CONTENT = "content";

  private static final HashFunction SHA1 = Hashing.sha1();

  /**
   * Maximum number of hits returned by {@link #search(String)}.
   *
   * Hard-coded for now, likely sufficient for needs anyways.
   */
  private static final int MAX_RESULTS = 100;

  private final Client client;

  private final String indexName;

  private final Path watchDirectory;

  /**
   * Active watcher, or {@code null} when not started (or already stopped).
   */
  @Nullable
  private DirectoryWatcher directoryWatcher;

  /**
   * Flag used to inform {@link DirectoryWatcher} via {@link DirectoryChangeListener#stopWatching()}
   * to stop if set to {@code false}.
   */
  private final AtomicBoolean watching = new AtomicBoolean(false);

  private SearchIndex(final Client client, final String indexName, final Path watchDirectory) {
    this.client = checkNotNull(client);
    this.indexName = checkNotNull(indexName);
    // normalize to an absolute path so page-paths can be computed via relativize()
    this.watchDirectory = checkNotNull(watchDirectory).toAbsolutePath();
  }

  public Client getClient() {
    return client;
  }

  public String getIndexName() {
    return indexName;
  }

  /**
   * Returns the absolute path of the directory being indexed and watched.
   */
  public Path getWatchDirectory() {
    return watchDirectory;
  }

  /**
   * Creates the index, indexes all existing content and starts watching the directory for changes.
   */
  @Override
  protected void doStart() throws Exception {
    // prepare the index
    client.admin().indices().prepareCreate(indexName)
      .setSource(jsonBuilder()
        .startObject()
        // TODO: index configuration
        .endObject())
      .get();
    log.info("Index created: {}", indexName);

    // first index all content, watcher will only show diffs
    SimpleFileVisitor<Path> indexVisitor = new SimpleFileVisitor<Path>() {
      @Override
      public FileVisitResult visitFile(final Path file, final BasicFileAttributes attrs) throws IOException {
        index(file);
        return FileVisitResult.CONTINUE;
      }
    };

    log.info("Scanning directory: {}", watchDirectory);
    Files.walkFileTree(watchDirectory, indexVisitor);

    // the watcher thread uses this local reference rather than the field, as doStop() clears
    // the field while the thread may still be looping
    final DirectoryWatcher watcher = DirectoryWatcher.builder()
      .directory(watchDirectory)
      .listener(new DirectoryChangeListener() {
        @Override
        public void onCreate(final Path file) throws IOException {
          index(file);
        }

        @Override
        public void onModify(final Path file) throws IOException {
          index(file);
        }

        @Override
        public void onDelete(final Path file) throws IOException {
          deindex(file);
        }

        @Override
        public boolean stopWatching() {
          return !watching.get();
        }
      })
      .build();
    directoryWatcher = watcher;

    watching.set(true);
    new Thread("DirectoryWatcher-" + System.currentTimeMillis()) {
      @Override
      public void run() {
        log.info("Watching directory: {}", watchDirectory);

        // watch() could fail if listener callback throws exception, warn and continue
        while (watching.get()) {
          try {
            watcher.watch();
          } catch (Exception e) {
            log.warn("Watch failed", e);
          }
        }

        log.info("Stopped watching");
      }
    }.start();
  }

  /**
   * Signals the watcher thread to stop and closes the watcher, if one was started.
   */
  @Override
  protected void doStop() throws Exception {
    DirectoryWatcher watcher = directoryWatcher;
    if (watcher != null) {
      watching.set(false);
      // clear the field first so state is reset even if close() fails
      directoryWatcher = null;
      watcher.close();
    }
  }

  /**
   * Refresh the index, waiting for the refresh request to complete.
   */
  public void refresh() throws ExecutionException, InterruptedException {
    client.admin().indices().refresh(new RefreshRequest(indexName)).get();
  }

  /**
   * Return page-path for file.
   *
   * This is the path of the file relative to the watch directory.
   */
  private String pagePath(final Path file) {
    return watchDirectory.relativize(file).toString();
  }

  /**
   * Return unique page-id for file.
   *
   * This is the SHA-1 hash of the page-path (UTF-8 encoded), rendered as a string.
   */
  private String pageId(final Path file) {
    String name = pagePath(file);
    return SHA1.hashString(name, Charsets.UTF_8).toString();
  }

  /**
   * Returns {@code true} if given file can be indexed.
   *
   * ATM only html files can be indexed ({@code .html} or {@code .htm} suffix, case-insensitive).
   */
  private boolean isIndexable(final Path file) {
    String fileName = file.toString().toLowerCase(Locale.US);
    return fileName.endsWith(".html") || fileName.endsWith(".htm");
  }

  /**
   * Create or update index entry for file.
   *
   * Files which are not indexable are ignored.
   *
   * @throws IOException if the file could not be parsed or the record could not be built
   */
  @VisibleForTesting
  void index(final Path file) throws IOException {
    checkNotNull(file);

    if (!isIndexable(file)) {
      log.debug("Ignoring non-indexable file: {}", file);
      return;
    }

    log.info("Indexing: {}", file);

    // parse the document to pull out title and strip tags for content indexing
    Document document = Jsoup.parse(file.toFile(), "UTF-8");

    XContentBuilder record = jsonBuilder()
      .startObject()
      .field(F_PATH, pagePath(file))
      .field(F_TITLE, document.title())
      .field(F_CONTENT, document.text())
      .endObject();

    // rendering the record to a string is only worth doing when it will actually be logged
    if (log.isDebugEnabled()) {
      log.debug("Record: {}", record.string());
    }

    IndexResponse response = client.prepareIndex(indexName, PAGE_TYPE, pageId(file))
      .setSource(record)
      .get();

    log.trace("Response: {}", response);
  }

  /**
   * Delete index entry for file.
   */
  @VisibleForTesting
  void deindex(final Path file) throws IOException {
    checkNotNull(file);

    log.info("Deindexing: {}", file);

    DeleteResponse response = client.prepareDelete(indexName, PAGE_TYPE, pageId(file))
      .get();

    log.trace("Response: {}", response);
  }

  /**
   * Search indexed pages.
   *
   * Uses ElasticSearch simple-query-string query.
   *
   * @param query the query string; must not be {@code null}
   * @return matching pages (path, title and score), at most {@link #MAX_RESULTS} entries
   */
  public List<SearchResult> search(final String query) {
    checkNotNull(query);

    log.info("Search w/query: {}", query);

    SearchResponse response = client.prepareSearch(indexName)
      .setTypes(PAGE_TYPE)
      .setQuery(QueryBuilders.simpleQueryStringQuery(query))
      .setSize(MAX_RESULTS)
      .get();

    log.trace("Response: {}", response);

    // gson doesn't seem to like to render Iterables.transform() result so make a list
    SearchHits hits = response.getHits();
    // size by the hits actually returned; the total hit count can be far larger than the page
    List<SearchResult> results = new ArrayList<>(hits.getHits().length);
    for (SearchHit hit : hits) {
      Map<String, Object> source = hit.getSource();
      results.add(new SearchResult(
        (String) source.get(F_PATH),
        (String) source.get(F_TITLE),
        hit.score()));
    }

    return results;
  }

  /**
   * Search index builder.
   *
   * All of client, index name and watch directory must be set before calling {@link #build()}.
   */
  public static class Builder {
    private Client client;

    private String indexName;

    private Path watchDirectory;

    public Builder client(final Client client) {
      this.client = client;
      return this;
    }

    public Builder indexName(final String indexName) {
      this.indexName = indexName;
      return this;
    }

    public Builder watchDirectory(final Path directory) {
      this.watchDirectory = directory;
      return this;
    }

    /**
     * Builds the search index.
     *
     * @throws IllegalStateException if client, index name or watch directory has not been set
     */
    public SearchIndex build() {
      checkState(client != null, "Missing: client");
      checkState(indexName != null, "Missing: indexName");
      checkState(watchDirectory != null, "Missing: watchDirectory");

      return new SearchIndex(client, indexName, watchDirectory);
    }
  }
}