org.h2.fulltext.FullTextLucene Maven / Gradle / Ivy
/*
* Copyright 2004-2014 H2 Group. Multiple-Licensed under the MPL 2.0,
* and the EPL 1.0 (http://h2database.com/html/license.html).
* Initial Developer: H2 Group
*/
package org.h2.fulltext;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DatabaseMetaData;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.h2.api.Trigger;
import org.h2.command.Parser;
import org.h2.engine.Session;
import org.h2.expression.ExpressionColumn;
import org.h2.jdbc.JdbcConnection;
import org.h2.store.fs.FileUtils;
import org.h2.tools.SimpleResultSet;
import org.h2.util.New;
import org.h2.util.StatementBuilder;
import org.h2.util.StringUtils;
import org.h2.util.Utils;
import java.io.File;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.apache.lucene.index.IndexWriter;
/**
* This class implements the full text search based on Apache Lucene.
* Most methods can be called using SQL statements as well.
*/
public class FullTextLucene extends FullText {
/**
* Whether the text content should be stored in the Lucene index.
*/
protected static final boolean STORE_DOCUMENT_TEXT_IN_INDEX =
Utils.getProperty("h2.storeDocumentTextInIndex", false);
private static final HashMap INDEX_ACCESS = New.hashMap();
private static final String TRIGGER_PREFIX = "FTL_";
private static final String SCHEMA = "FTL";
private static final String LUCENE_FIELD_DATA = "_DATA";
private static final String LUCENE_FIELD_QUERY = "_QUERY";
private static final String LUCENE_FIELD_MODIFIED = "_modified";
private static final String LUCENE_FIELD_COLUMN_PREFIX = "_";
/**
* The prefix for a in-memory path. This prefix is only used internally
* within this class and not related to the database URL.
*/
private static final String IN_MEMORY_PREFIX = "mem:";
/**
* Initializes full text search functionality for this database. This adds
* the following Java functions to the database:
*
* - FTL_CREATE_INDEX(schemaNameString, tableNameString,
* columnListString)
* - FTL_SEARCH(queryString, limitInt, offsetInt): result set
* - FTL_REINDEX()
* - FTL_DROP_ALL()
*
* It also adds a schema FTL to the database where bookkeeping information
* is stored. This function may be called from a Java application, or by
* using the SQL statements:
*
*
* CREATE ALIAS IF NOT EXISTS FTL_INIT FOR
* "org.h2.fulltext.FullTextLucene.init";
* CALL FTL_INIT();
*
*
* @param conn the connection
*/
public static void init(Connection conn) throws SQLException {
Statement stat = conn.createStatement();
stat.execute("CREATE SCHEMA IF NOT EXISTS " + SCHEMA);
stat.execute("CREATE TABLE IF NOT EXISTS " + SCHEMA +
".INDEXES(SCHEMA VARCHAR, TABLE VARCHAR, " +
"COLUMNS VARCHAR, PRIMARY KEY(SCHEMA, TABLE))");
stat.execute("CREATE ALIAS IF NOT EXISTS FTL_CREATE_INDEX FOR \"" +
FullTextLucene.class.getName() + ".createIndex\"");
stat.execute("CREATE ALIAS IF NOT EXISTS FTL_DROP_INDEX FOR \"" +
FullTextLucene.class.getName() + ".dropIndex\"");
stat.execute("CREATE ALIAS IF NOT EXISTS FTL_SEARCH FOR \"" +
FullTextLucene.class.getName() + ".search\"");
stat.execute("CREATE ALIAS IF NOT EXISTS FTL_SEARCH_DATA FOR \"" +
FullTextLucene.class.getName() + ".searchData\"");
stat.execute("CREATE ALIAS IF NOT EXISTS FTL_REINDEX FOR \"" +
FullTextLucene.class.getName() + ".reindex\"");
stat.execute("CREATE ALIAS IF NOT EXISTS FTL_DROP_ALL FOR \"" +
FullTextLucene.class.getName() + ".dropAll\"");
try {
getIndexAccess(conn);
} catch (SQLException e) {
throw convertException(e);
}
}
/**
* Create a new full text index for a table and column list. Each table may
* only have one index at any time.
*
* @param conn the connection
* @param schema the schema name of the table (case sensitive)
* @param table the table name (case sensitive)
* @param columnList the column list (null for all columns)
*/
public static void createIndex(Connection conn, String schema,
String table, String columnList) throws SQLException {
init(conn);
PreparedStatement prep = conn.prepareStatement("INSERT INTO " + SCHEMA
+ ".INDEXES(SCHEMA, TABLE, COLUMNS) VALUES(?, ?, ?)");
prep.setString(1, schema);
prep.setString(2, table);
prep.setString(3, columnList);
prep.execute();
createTrigger(conn, schema, table);
indexExistingRows(conn, schema, table);
}
/**
* Drop an existing full text index for a table. This method returns
* silently if no index for this table exists.
*
* @param conn the connection
* @param schema the schema name of the table (case sensitive)
* @param table the table name (case sensitive)
*/
public static void dropIndex(Connection conn, String schema, String table)
throws SQLException {
init(conn);
PreparedStatement prep = conn.prepareStatement("DELETE FROM " + SCHEMA
+ ".INDEXES WHERE SCHEMA=? AND TABLE=?");
prep.setString(1, schema);
prep.setString(2, table);
int rowCount = prep.executeUpdate();
if (rowCount == 0) {
return;
}
reindex(conn);
}
/**
* Re-creates the full text index for this database. Calling this method is
* usually not needed, as the index is kept up-to-date automatically.
*
* @param conn the connection
*/
public static void reindex(Connection conn) throws SQLException {
init(conn);
removeAllTriggers(conn, TRIGGER_PREFIX);
removeIndexFiles(conn);
Statement stat = conn.createStatement();
ResultSet rs = stat.executeQuery("SELECT * FROM " + SCHEMA + ".INDEXES");
while (rs.next()) {
String schema = rs.getString("SCHEMA");
String table = rs.getString("TABLE");
createTrigger(conn, schema, table);
indexExistingRows(conn, schema, table);
}
}
/**
* Drops all full text indexes from the database.
*
* @param conn the connection
*/
public static void dropAll(Connection conn) throws SQLException {
Statement stat = conn.createStatement();
stat.execute("DROP SCHEMA IF EXISTS " + SCHEMA);
removeAllTriggers(conn, TRIGGER_PREFIX);
removeIndexFiles(conn);
}
/**
* Searches from the full text index for this database.
* The returned result set has the following column:
* - QUERY (varchar): the query to use to get the data.
* The query does not include 'SELECT * FROM '. Example:
* PUBLIC.TEST WHERE ID = 1
*
- SCORE (float) the relevance score as returned by Lucene.
*
*
* @param conn the connection
* @param text the search query
* @param limit the maximum number of rows or 0 for no limit
* @param offset the offset or 0 for no offset
* @return the result set
*/
public static ResultSet search(Connection conn, String text, int limit,
int offset) throws SQLException {
return search(conn, text, limit, offset, false);
}
/**
* Searches from the full text index for this database. The result contains
* the primary key data as an array. The returned result set has the
* following columns:
*
* - SCHEMA (varchar): the schema name. Example: PUBLIC
* - TABLE (varchar): the table name. Example: TEST
* - COLUMNS (array of varchar): comma separated list of quoted column
* names. The column names are quoted if necessary. Example: (ID)
* - KEYS (array of values): comma separated list of values.
* Example: (1)
* - SCORE (float) the relevance score as returned by Lucene.
*
*
* @param conn the connection
* @param text the search query
* @param limit the maximum number of rows or 0 for no limit
* @param offset the offset or 0 for no offset
* @return the result set
*/
public static ResultSet searchData(Connection conn, String text, int limit,
int offset) throws SQLException {
return search(conn, text, limit, offset, true);
}
/**
* Convert an exception to a fulltext exception.
*
* @param e the original exception
* @return the converted SQL exception
*/
protected static SQLException convertException(Exception e) {
SQLException e2 = new SQLException(
"Error while indexing document", "FULLTEXT");
e2.initCause(e);
return e2;
}
/**
* Create the trigger.
*
* @param conn the database connection
* @param schema the schema name
* @param table the table name
*/
protected static void createTrigger(Connection conn, String schema,
String table) throws SQLException {
createOrDropTrigger(conn, schema, table, true);
}
private static void createOrDropTrigger(Connection conn,
String schema, String table, boolean create) throws SQLException {
Statement stat = conn.createStatement();
String trigger = StringUtils.quoteIdentifier(schema) + "." +
StringUtils.quoteIdentifier(TRIGGER_PREFIX + table);
stat.execute("DROP TRIGGER IF EXISTS " + trigger);
if (create) {
StringBuilder buff = new StringBuilder(
"CREATE TRIGGER IF NOT EXISTS ");
// the trigger is also called on rollback because transaction
// rollback will not undo the changes in the Lucene index
buff.append(trigger).
append(" AFTER INSERT, UPDATE, DELETE, ROLLBACK ON ").
append(StringUtils.quoteIdentifier(schema)).
append('.').
append(StringUtils.quoteIdentifier(table)).
append(" FOR EACH ROW CALL \"").
append(FullTextLucene.FullTextTrigger.class.getName()).
append('\"');
stat.execute(buff.toString());
}
}
/**
* Get the index writer/searcher wrapper for the given connection.
*
* @param conn the connection
* @return the index access wrapper
*/
protected static IndexAccess getIndexAccess(Connection conn)
throws SQLException {
String path = getIndexPath(conn);
synchronized (INDEX_ACCESS) {
IndexAccess access = INDEX_ACCESS.get(path);
if (access == null) {
try {
Directory indexDir = path.startsWith(IN_MEMORY_PREFIX) ?
new RAMDirectory() : FSDirectory.open(new File(path));
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_30, analyzer);
conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
IndexWriter writer = new IndexWriter(indexDir, conf);
//see http://wiki.apache.org/lucene-java/NearRealtimeSearch
IndexReader reader = IndexReader.open(writer, true);
access = new IndexAccess();
access.writer = writer;
access.reader = reader;
access.searcher = new IndexSearcher(reader);
} catch (IOException e) {
throw convertException(e);
}
INDEX_ACCESS.put(path, access);
}
return access;
}
}
/**
* Get the path of the Lucene index for this database.
*
* @param conn the database connection
* @return the path
*/
protected static String getIndexPath(Connection conn) throws SQLException {
Statement stat = conn.createStatement();
ResultSet rs = stat.executeQuery("CALL DATABASE_PATH()");
rs.next();
String path = rs.getString(1);
if (path == null) {
return IN_MEMORY_PREFIX + conn.getCatalog();
}
int index = path.lastIndexOf(':');
// position 1 means a windows drive letter is used, ignore that
if (index > 1) {
path = path.substring(index + 1);
}
rs.close();
return path;
}
/**
* Add the existing data to the index.
*
* @param conn the database connection
* @param schema the schema name
* @param table the table name
*/
protected static void indexExistingRows(Connection conn, String schema,
String table) throws SQLException {
FullTextLucene.FullTextTrigger existing = new FullTextLucene.FullTextTrigger();
existing.init(conn, schema, null, table, false, Trigger.INSERT);
String sql = "SELECT * FROM " + StringUtils.quoteIdentifier(schema) +
"." + StringUtils.quoteIdentifier(table);
ResultSet rs = conn.createStatement().executeQuery(sql);
int columnCount = rs.getMetaData().getColumnCount();
while (rs.next()) {
Object[] row = new Object[columnCount];
for (int i = 0; i < columnCount; i++) {
row[i] = rs.getObject(i + 1);
}
existing.insert(row, false);
}
existing.commitIndex();
}
private static void removeIndexFiles(Connection conn) throws SQLException {
String path = getIndexPath(conn);
IndexAccess access = INDEX_ACCESS.get(path);
if (access != null) {
removeIndexAccess(access, path);
}
if (!path.startsWith(IN_MEMORY_PREFIX)) {
FileUtils.deleteRecursive(path, false);
}
}
/**
* Close the index writer and searcher and remove them from the index access
* set.
*
* @param access the index writer/searcher wrapper
* @param indexPath the index path
*/
protected static void removeIndexAccess(IndexAccess access, String indexPath)
throws SQLException {
synchronized (INDEX_ACCESS) {
try {
INDEX_ACCESS.remove(indexPath);
access.searcher.close();
access.reader.close();
access.writer.close();
} catch (Exception e) {
throw convertException(e);
}
}
}
/**
* Do the search.
*
* @param conn the database connection
* @param text the query
* @param limit the limit
* @param offset the offset
* @param data whether the raw data should be returned
* @return the result set
*/
protected static ResultSet search(Connection conn, String text,
int limit, int offset, boolean data) throws SQLException {
SimpleResultSet result = createResultSet(data);
if (conn.getMetaData().getURL().startsWith("jdbc:columnlist:")) {
// this is just to query the result set columns
return result;
}
if (text == null || text.trim().length() == 0) {
return result;
}
try {
IndexAccess access = getIndexAccess(conn);
// take a reference as the searcher may change
IndexSearcher searcher = access.searcher;
// reuse the same analyzer; it's thread-safe;
// also allows subclasses to control the analyzer used.
Analyzer analyzer = access.writer.getAnalyzer();
QueryParser parser = new QueryParser(Version.LUCENE_30,
LUCENE_FIELD_DATA, analyzer);
Query query = parser.parse(text);
// Lucene 3 insists on a hard limit and will not provide
// a total hits value. Take at least 100 which is
// an optimal limit for Lucene as any more
// will trigger writing results to disk.
int maxResults = (limit == 0 ? 100 : limit) + offset;
TopDocs docs = searcher.search(query, maxResults);
if (limit == 0) {
limit = docs.totalHits;
}
for (int i = 0, len = docs.scoreDocs.length;
i < limit && i + offset < docs.totalHits
&& i + offset < len; i++) {
ScoreDoc sd = docs.scoreDocs[i + offset];
Document doc = searcher.doc(sd.doc);
float score = sd.score;
String q = doc.get(LUCENE_FIELD_QUERY);
if (data) {
int idx = q.indexOf(" WHERE ");
JdbcConnection c = (JdbcConnection) conn;
Session session = (Session) c.getSession();
Parser p = new Parser(session);
String tab = q.substring(0, idx);
ExpressionColumn expr = (ExpressionColumn) p.parseExpression(tab);
String schemaName = expr.getOriginalTableAliasName();
String tableName = expr.getColumnName();
q = q.substring(idx + " WHERE ".length());
Object[][] columnData = parseKey(conn, q);
result.addRow(
schemaName,
tableName,
columnData[0],
columnData[1],
score);
} else {
result.addRow(q, score);
}
}
} catch (Exception e) {
throw convertException(e);
}
return result;
}
/**
* Trigger updates the index when a inserting, updating, or deleting a row.
*/
public static class FullTextTrigger implements Trigger {
protected String schema;
protected String table;
protected int[] keys;
protected int[] indexColumns;
protected String[] columns;
protected int[] columnTypes;
protected String indexPath;
protected IndexAccess indexAccess;
/**
* INTERNAL
*/
@Override
public void init(Connection conn, String schemaName, String triggerName,
String tableName, boolean before, int type) throws SQLException {
this.schema = schemaName;
this.table = tableName;
this.indexPath = getIndexPath(conn);
this.indexAccess = getIndexAccess(conn);
ArrayList keyList = New.arrayList();
DatabaseMetaData meta = conn.getMetaData();
ResultSet rs = meta.getColumns(null,
StringUtils.escapeMetaDataPattern(schemaName),
StringUtils.escapeMetaDataPattern(tableName),
null);
ArrayList columnList = New.arrayList();
while (rs.next()) {
columnList.add(rs.getString("COLUMN_NAME"));
}
columnTypes = new int[columnList.size()];
columns = new String[columnList.size()];
columnList.toArray(columns);
rs = meta.getColumns(null,
StringUtils.escapeMetaDataPattern(schemaName),
StringUtils.escapeMetaDataPattern(tableName),
null);
for (int i = 0; rs.next(); i++) {
columnTypes[i] = rs.getInt("DATA_TYPE");
}
if (keyList.size() == 0) {
rs = meta.getPrimaryKeys(null,
StringUtils.escapeMetaDataPattern(schemaName),
tableName);
while (rs.next()) {
keyList.add(rs.getString("COLUMN_NAME"));
}
}
if (keyList.size() == 0) {
throw throwException("No primary key for table " + tableName);
}
ArrayList indexList = New.arrayList();
PreparedStatement prep = conn.prepareStatement(
"SELECT COLUMNS FROM " + SCHEMA
+ ".INDEXES WHERE SCHEMA=? AND TABLE=?");
prep.setString(1, schemaName);
prep.setString(2, tableName);
rs = prep.executeQuery();
if (rs.next()) {
String cols = rs.getString(1);
if (cols != null) {
for (String s : StringUtils.arraySplit(cols, ',', true)) {
indexList.add(s);
}
}
}
if (indexList.size() == 0) {
indexList.addAll(columnList);
}
keys = new int[keyList.size()];
setColumns(keys, keyList, columnList);
indexColumns = new int[indexList.size()];
setColumns(indexColumns, indexList, columnList);
}
/**
* INTERNAL
*/
@Override
public void fire(Connection conn, Object[] oldRow, Object[] newRow)
throws SQLException {
if (oldRow != null) {
if (newRow != null) {
// update
if (hasChanged(oldRow, newRow, indexColumns)) {
delete(oldRow, false);
insert(newRow, true);
}
} else {
// delete
delete(oldRow, true);
}
} else if (newRow != null) {
// insert
insert(newRow, true);
}
}
/**
* INTERNAL
*/
@Override
public void close() throws SQLException {
if (indexAccess != null) {
removeIndexAccess(indexAccess, indexPath);
indexAccess = null;
}
}
/**
* INTERNAL
*/
@Override
public void remove() {
// ignore
}
/**
* Commit all changes to the Lucene index.
*/
void commitIndex() throws SQLException {
try {
indexAccess.writer.commit();
// recreate Searcher with the IndexWriter's reader.
indexAccess.searcher.close();
indexAccess.reader.close();
indexAccess.reader = IndexReader.open(indexAccess.writer, true);
indexAccess.searcher = new IndexSearcher(indexAccess.reader);
} catch (IOException e) {
throw convertException(e);
}
}
/**
* Add a row to the index.
*
* @param row the row
* @param commitIndex whether to commit the changes to the Lucene index
*/
protected void insert(Object[] row, boolean commitIndex) throws SQLException {
String query = getQuery(row);
Document doc = new Document();
doc.add(new Field(LUCENE_FIELD_QUERY, query,
Field.Store.YES, Field.Index.NOT_ANALYZED));
long time = System.currentTimeMillis();
doc.add(new Field(LUCENE_FIELD_MODIFIED,
DateTools.timeToString(time, DateTools.Resolution.SECOND),
Field.Store.YES, Field.Index.NOT_ANALYZED));
StatementBuilder buff = new StatementBuilder();
for (int index : indexColumns) {
String columnName = columns[index];
String data = asString(row[index], columnTypes[index]);
// column names that start with _
// must be escaped to avoid conflicts
// with internal field names (_DATA, _QUERY, _modified)
if (columnName.startsWith(LUCENE_FIELD_COLUMN_PREFIX)) {
columnName = LUCENE_FIELD_COLUMN_PREFIX + columnName;
}
doc.add(new Field(columnName, data,
Field.Store.NO, Field.Index.ANALYZED));
buff.appendExceptFirst(" ");
buff.append(data);
}
Field.Store storeText = STORE_DOCUMENT_TEXT_IN_INDEX ?
Field.Store.YES : Field.Store.NO;
doc.add(new Field(LUCENE_FIELD_DATA, buff.toString(), storeText,
Field.Index.ANALYZED));
try {
indexAccess.writer.addDocument(doc);
if (commitIndex) {
commitIndex();
}
} catch (IOException e) {
throw convertException(e);
}
}
/**
* Delete a row from the index.
*
* @param row the row
* @param commitIndex whether to commit the changes to the Lucene index
*/
protected void delete(Object[] row, boolean commitIndex) throws SQLException {
String query = getQuery(row);
try {
Term term = new Term(LUCENE_FIELD_QUERY, query);
indexAccess.writer.deleteDocuments(term);
if (commitIndex) {
commitIndex();
}
} catch (IOException e) {
throw convertException(e);
}
}
private String getQuery(Object[] row) throws SQLException {
StatementBuilder buff = new StatementBuilder();
if (schema != null) {
buff.append(StringUtils.quoteIdentifier(schema)).append('.');
}
buff.append(StringUtils.quoteIdentifier(table)).append(" WHERE ");
for (int columnIndex : keys) {
buff.appendExceptFirst(" AND ");
buff.append(StringUtils.quoteIdentifier(columns[columnIndex]));
Object o = row[columnIndex];
if (o == null) {
buff.append(" IS NULL");
} else {
buff.append('=').append(FullText.quoteSQL(o, columnTypes[columnIndex]));
}
}
return buff.toString();
}
}
/**
* A wrapper for the Lucene writer and searcher.
*/
static class IndexAccess {
/**
* The index writer.
*/
IndexWriter writer;
/**
* The index reader.
*/
IndexReader reader;
/**
* The index searcher.
*/
IndexSearcher searcher;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy