
io.airlift.http.server.search.SearchIndex Maven / Gradle / Ivy
The newest version!
package io.airlift.http.server.search;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import java.io.IOException;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.atomic.AtomicBoolean;
import javax.annotation.Nullable;
import org.elasticsearch.action.admin.indices.refresh.RefreshRequest;
import org.elasticsearch.action.delete.DeleteResponse;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.sonatype.goodies.lifecycle.LifecycleSupport;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Charsets;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import io.takari.watcher.DirectoryChangeListener;
import io.takari.watcher.DirectoryWatcher;
/**
* Search index.
*
* @since 0.0.6
*/
public class SearchIndex extends LifecycleSupport {
private static final String PAGE_TYPE = "page";
private static final String F_PATH = "path";
private static final String F_TITLE = "title";
private static final String F_CONTENT = "content";
private static final HashFunction SHA1 = Hashing.sha1();
private final Client client;
private final String indexName;
private final Path watchDirectory;
@Nullable
private DirectoryWatcher directoryWatcher;
/**
* Flag used to inform {@link DirectoryWatcher} via {@link DirectoryChangeListener#stopWatching()}
* to stop if set to {@code false}.
*/
private final AtomicBoolean watching = new AtomicBoolean(false);
private SearchIndex(final Client client, final String indexName, final Path watchDirectory) {
this.client = checkNotNull(client);
this.indexName = checkNotNull(indexName);
this.watchDirectory = checkNotNull(watchDirectory).toAbsolutePath();
}
public Client getClient() {
return client;
}
public String getIndexName() {
return indexName;
}
public Path getWatchDirectory() {
return watchDirectory;
}
@Override
protected void doStart() throws Exception {
// prepare the index
client.admin().indices().prepareCreate(indexName)
.setSource(jsonBuilder()
.startObject()
// TODO: index configuration
.endObject())
.get();
log.info("Index created: {}", indexName);
// first index all content, watcher will only show diffs
SimpleFileVisitor indexVisitor = new SimpleFileVisitor() {
@Override
public FileVisitResult visitFile(final Path file, final BasicFileAttributes attrs) throws IOException {
index(file);
return FileVisitResult.CONTINUE;
}
};
log.info("Scanning directory: {}", watchDirectory);
Files.walkFileTree(watchDirectory, indexVisitor);
directoryWatcher = DirectoryWatcher.builder()
.directory(watchDirectory)
.listener(new DirectoryChangeListener() {
@Override
public void onCreate(final Path file) throws IOException {
index(file);
}
@Override
public void onModify(final Path file) throws IOException {
index(file);
}
@Override
public void onDelete(final Path file) throws IOException {
deindex(file);
}
@Override
public boolean stopWatching() {
return !watching.get();
}
})
.build();
watching.set(true);
new Thread("DirectoryWatcher-" + System.currentTimeMillis()) {
@Override
public void run() {
log.info("Watching directory: {}", watchDirectory);
// watch() could fail if listener callback throws exception, warn and continue
while (watching.get()) {
try {
directoryWatcher.watch();
} catch (Exception e) {
log.warn("Watch failed", e);
}
}
log.info("Stopped watching");
}
}.start();
}
@Override
protected void doStop() throws Exception {
if (directoryWatcher != null) {
watching.set(false);
directoryWatcher.close();
directoryWatcher = null;
}
}
public void refresh() throws ExecutionException, InterruptedException {
client.admin().indices().refresh(new RefreshRequest(indexName)).get();
}
/**
* Return page-path for file.
*/
private String pagePath(final Path file) {
return watchDirectory.relativize(file).toString();
}
/**
* Return unique page-id for file.
*
* This is the page-path base64 encoded.
*/
private String pageId(final Path file) {
String name = pagePath(file);
return SHA1.hashString(name, Charsets.UTF_8).toString();
}
/**
* Returns {@code true} if given file can be indexed.
*
* ATM only html files can be indexed.
*/
private boolean isIndexable(final Path file) {
String fileName = file.toString().toLowerCase(Locale.US);
return fileName.endsWith(".html") || fileName.endsWith(".htm");
}
/**
* Create or update index entry for file.
*/
@VisibleForTesting
void index(final Path file) throws IOException {
checkNotNull(file);
if (!isIndexable(file)) {
log.debug("Ignoring non-indexable file: {}", file);
return;
}
log.info("Indexing: {}", file);
// parse the document to pull out title and strip tags for content indexing
Document document = Jsoup.parse(file.toFile(), "UTF-8");
XContentBuilder record = jsonBuilder()
.startObject()
.field(F_PATH, pagePath(file))
.field(F_TITLE, document.title())
.field(F_CONTENT, document.text())
.endObject();
log.debug("Record: {}", record.string());
IndexResponse response = client.prepareIndex(indexName, PAGE_TYPE, pageId(file))
.setSource(record)
.get();
log.trace("Response: {}", response);
}
/**
* Delete index entry for file.
*/
@VisibleForTesting
void deindex(final Path file) throws IOException {
checkNotNull(file);
log.info("Deindexing: {}", file);
DeleteResponse response = client.prepareDelete(indexName, PAGE_TYPE, pageId(file))
.get();
log.trace("Response: {}", response);
}
/**
* Search indexed pages.
*
* Uses ElasticSearch simple-query-string query.
*/
public List search(final String query) {
checkNotNull(query);
log.info("Search w/query: {}", query);
SearchResponse response = client.prepareSearch(indexName)
.setTypes(PAGE_TYPE)
.setQuery(QueryBuilders.simpleQueryStringQuery(query))
// hard-coding size for now, likely sufficient for needs anyways
.setSize(100)
.get();
log.trace("Response: {}", response);
// gson doesn't seem to like to render Iterables.transform() result so make a list
SearchHits hits = response.getHits();
List results = new ArrayList<>((int)hits.totalHits());
for (SearchHit hit : hits) {
Map source = hit.getSource();
results.add(new SearchResult(
(String) source.get(F_PATH),
(String) source.get(F_TITLE),
hit.score()));
}
return results;
}
/**
* Search index builder.
*/
public static class Builder {
private Client client;
private String indexName;
private Path watchDirectory;
public Builder client(final Client client) {
this.client = client;
return this;
}
public Builder indexName(final String indexName) {
this.indexName = indexName;
return this;
}
public Builder watchDirectory(final Path directory) {
this.watchDirectory = directory;
return this;
}
public SearchIndex build() {
checkState(client != null, "Missing: client");
checkState(indexName != null, "Missing: indexName");
checkState(watchDirectory != null, "Missing: watchDirectory");
return new SearchIndex(client, indexName, watchDirectory);
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy