All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.h2.fulltext.FullTextLucene Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2004-2023 H2 Group. Multiple-Licensed under the MPL 2.0,
 * and the EPL 1.0 (https://h2database.com/html/license.html).
 * Initial Developer: H2 Group
 */
package org.h2.fulltext;

import java.io.IOException;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.DatabaseMetaData;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.h2.api.Trigger;
import org.h2.command.Parser;
import org.h2.engine.SessionLocal;
import org.h2.expression.ExpressionColumn;
import org.h2.jdbc.JdbcConnection;
import org.h2.store.fs.FileUtils;
import org.h2.tools.SimpleResultSet;
import org.h2.util.StringUtils;
import org.h2.util.Utils;

/**
 * This class implements the full text search based on Apache Lucene.
 * Most methods can be called using SQL statements as well.
 */
public class FullTextLucene extends FullText {

    /**
     * Whether the text content should be stored in the Lucene index.
     */
    protected static final boolean STORE_DOCUMENT_TEXT_IN_INDEX =
            Utils.getProperty("h2.storeDocumentTextInIndex", false);

    private static final HashMap INDEX_ACCESS = new HashMap<>();
    private static final String TRIGGER_PREFIX = "FTL_";
    private static final String SCHEMA = "FTL";
    private static final String LUCENE_FIELD_DATA = "_DATA";
    private static final String LUCENE_FIELD_QUERY = "_QUERY";
    private static final String LUCENE_FIELD_MODIFIED = "_modified";
    private static final String LUCENE_FIELD_COLUMN_PREFIX = "_";

    /**
     * The prefix for a in-memory path. This prefix is only used internally
     * within this class and not related to the database URL.
     */
    private static final String IN_MEMORY_PREFIX = "mem:";

    /**
     * Initializes full text search functionality for this database. This adds
     * the following Java functions to the database:
     * 
    *
  • FTL_CREATE_INDEX(schemaNameString, tableNameString, * columnListString)
  • *
  • FTL_SEARCH(queryString, limitInt, offsetInt): result set
  • *
  • FTL_REINDEX()
  • *
  • FTL_DROP_ALL()
  • *
* It also adds a schema FTL to the database where bookkeeping information * is stored. This function may be called from a Java application, or by * using the SQL statements: * *
     * CREATE ALIAS IF NOT EXISTS FTL_INIT FOR
     *      "org.h2.fulltext.FullTextLucene.init";
     * CALL FTL_INIT();
     * 
* * @param conn the connection * @throws SQLException on failure */ public static void init(Connection conn) throws SQLException { try (Statement stat = conn.createStatement()) { stat.execute("CREATE SCHEMA IF NOT EXISTS " + SCHEMA); stat.execute("CREATE TABLE IF NOT EXISTS " + SCHEMA + ".INDEXES(SCHEMA VARCHAR, `TABLE` VARCHAR, " + "COLUMNS VARCHAR, PRIMARY KEY(SCHEMA, `TABLE`))"); String className = FullTextLucene.class.getName(); stat.execute("CREATE ALIAS IF NOT EXISTS FTL_CREATE_INDEX FOR '" + className + ".createIndex'"); stat.execute("CREATE ALIAS IF NOT EXISTS FTL_DROP_INDEX FOR '" + className + ".dropIndex'"); stat.execute("CREATE ALIAS IF NOT EXISTS FTL_SEARCH FOR '" + className + ".search'"); stat.execute("CREATE ALIAS IF NOT EXISTS FTL_SEARCH_DATA FOR '" + className + ".searchData'"); stat.execute("CREATE ALIAS IF NOT EXISTS FTL_REINDEX FOR '" + className + ".reindex'"); stat.execute("CREATE ALIAS IF NOT EXISTS FTL_DROP_ALL FOR '" + className + ".dropAll'"); } } /** * Create a new full text index for a table and column list. Each table may * only have one index at any time. * * @param conn the connection * @param schema the schema name of the table (case sensitive) * @param table the table name (case sensitive) * @param columnList the column list (null for all columns) * @throws SQLException on failure */ public static void createIndex(Connection conn, String schema, String table, String columnList) throws SQLException { init(conn); PreparedStatement prep = conn.prepareStatement("INSERT INTO " + SCHEMA + ".INDEXES(SCHEMA, `TABLE`, COLUMNS) VALUES(?, ?, ?)"); prep.setString(1, schema); prep.setString(2, table); prep.setString(3, columnList); prep.execute(); createTrigger(conn, schema, table); indexExistingRows(conn, schema, table); } /** * Drop an existing full text index for a table. This method returns * silently if no index for this table exists. * * @param conn the connection * @param schema the schema name of the table (case sensitive) * @param table the table name (case sensitive) * @throws SQLException on failure */ public static void dropIndex(Connection conn, String schema, String table) throws SQLException { init(conn); PreparedStatement prep = conn.prepareStatement("DELETE FROM " + SCHEMA + ".INDEXES WHERE SCHEMA=? AND `TABLE`=?"); prep.setString(1, schema); prep.setString(2, table); int rowCount = prep.executeUpdate(); if (rowCount != 0) { reindex(conn); } } /** * Re-creates the full text index for this database. Calling this method is * usually not needed, as the index is kept up-to-date automatically. * * @param conn the connection * @throws SQLException on failure */ public static void reindex(Connection conn) throws SQLException { init(conn); removeAllTriggers(conn, TRIGGER_PREFIX); removeIndexFiles(conn); Statement stat = conn.createStatement(); ResultSet rs = stat.executeQuery("SELECT * FROM " + SCHEMA + ".INDEXES"); while (rs.next()) { String schema = rs.getString("SCHEMA"); String table = rs.getString("TABLE"); createTrigger(conn, schema, table); indexExistingRows(conn, schema, table); } } /** * Drops all full text indexes from the database. * * @param conn the connection * @throws SQLException on failure */ public static void dropAll(Connection conn) throws SQLException { Statement stat = conn.createStatement(); stat.execute("DROP SCHEMA IF EXISTS " + SCHEMA + " CASCADE"); removeAllTriggers(conn, TRIGGER_PREFIX); removeIndexFiles(conn); } /** * Searches from the full text index for this database. * The returned result set has the following column: *
  • QUERY (varchar): the query to use to get the data. * The query does not include 'SELECT * FROM '. Example: * PUBLIC.TEST WHERE ID = 1 *
  • SCORE (float) the relevance score as returned by Lucene. *
* * @param conn the connection * @param text the search query * @param limit the maximum number of rows or 0 for no limit * @param offset the offset or 0 for no offset * @return the result set * @throws SQLException on failure */ public static ResultSet search(Connection conn, String text, int limit, int offset) throws SQLException { return search(conn, text, limit, offset, false); } /** * Searches from the full text index for this database. The result contains * the primary key data as an array. The returned result set has the * following columns: *
    *
  • SCHEMA (varchar): the schema name. Example: PUBLIC
  • *
  • TABLE (varchar): the table name. Example: TEST
  • *
  • COLUMNS (array of varchar): comma separated list of quoted column * names. The column names are quoted if necessary. Example: (ID)
  • *
  • KEYS (array of values): comma separated list of values. * Example: (1)
  • *
  • SCORE (float) the relevance score as returned by Lucene.
  • *
* * @param conn the connection * @param text the search query * @param limit the maximum number of rows or 0 for no limit * @param offset the offset or 0 for no offset * @return the result set * @throws SQLException on failure */ public static ResultSet searchData(Connection conn, String text, int limit, int offset) throws SQLException { return search(conn, text, limit, offset, true); } /** * Convert an exception to a fulltext exception. * * @param e the original exception * @return the converted SQL exception */ protected static SQLException convertException(Exception e) { return new SQLException("Error while indexing document", "FULLTEXT", e); } /** * Create the trigger. * * @param conn the database connection * @param schema the schema name * @param table the table name * @throws SQLException on failure */ private static void createTrigger(Connection conn, String schema, String table) throws SQLException { createOrDropTrigger(conn, schema, table, true); } private static void createOrDropTrigger(Connection conn, String schema, String table, boolean create) throws SQLException { Statement stat = conn.createStatement(); String trigger = StringUtils.quoteIdentifier(schema) + "." + StringUtils.quoteIdentifier(TRIGGER_PREFIX + table); stat.execute("DROP TRIGGER IF EXISTS " + trigger); if (create) { StringBuilder builder = new StringBuilder( "CREATE TRIGGER IF NOT EXISTS "); // the trigger is also called on rollback because transaction // rollback will not undo the changes in the Lucene index builder.append(trigger). append(" AFTER INSERT, UPDATE, DELETE, ROLLBACK ON "); StringUtils.quoteIdentifier(builder, schema). append('.'); StringUtils.quoteIdentifier(builder, table). append(" FOR EACH ROW CALL \""). append(FullTextLucene.FullTextTrigger.class.getName()). append('\"'); stat.execute(builder.toString()); } } /** * Get the index writer/searcher wrapper for the given connection. * * @param conn the connection * @return the index access wrapper * @throws SQLException on failure */ protected static IndexAccess getIndexAccess(Connection conn) throws SQLException { String path = getIndexPath(conn); synchronized (INDEX_ACCESS) { IndexAccess access = INDEX_ACCESS.get(path); while (access == null) { try { Directory indexDir = path.startsWith(IN_MEMORY_PREFIX) ? new ByteBuffersDirectory() : FSDirectory.open(Paths.get(path)); Analyzer analyzer = new StandardAnalyzer(); IndexWriterConfig conf = new IndexWriterConfig(analyzer); conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); IndexWriter writer = new IndexWriter(indexDir, conf); //see https://cwiki.apache.org/confluence/display/lucene/NearRealtimeSearch access = new IndexAccess(writer); } catch (IndexFormatTooOldException e) { reindex(conn); continue; } catch (IOException e) { throw convertException(e); } INDEX_ACCESS.put(path, access); break; } return access; } } /** * Get the path of the Lucene index for this database. * * @param conn the database connection * @return the path * @throws SQLException on failure */ protected static String getIndexPath(Connection conn) throws SQLException { Statement stat = conn.createStatement(); ResultSet rs = stat.executeQuery("CALL DATABASE_PATH()"); rs.next(); String path = rs.getString(1); if (path == null) { return IN_MEMORY_PREFIX + conn.getCatalog(); } int index = path.lastIndexOf(':'); // position 1 means a windows drive letter is used, ignore that if (index > 1) { path = path.substring(index + 1); } rs.close(); return path; } /** * Add the existing data to the index. * * @param conn the database connection * @param schema the schema name * @param table the table name * @throws SQLException on failure */ private static void indexExistingRows(Connection conn, String schema, String table) throws SQLException { FullTextLucene.FullTextTrigger existing = new FullTextLucene.FullTextTrigger(); existing.init(conn, schema, null, table, false, Trigger.INSERT); String sql = "SELECT * FROM " + StringUtils.quoteIdentifier(schema) + "." + StringUtils.quoteIdentifier(table); ResultSet rs = conn.createStatement().executeQuery(sql); int columnCount = rs.getMetaData().getColumnCount(); while (rs.next()) { Object[] row = new Object[columnCount]; for (int i = 0; i < columnCount; i++) { row[i] = rs.getObject(i + 1); } existing.insert(row, false); } existing.commitIndex(); } private static void removeIndexFiles(Connection conn) throws SQLException { String path = getIndexPath(conn); removeIndexAccess(path); if (!path.startsWith(IN_MEMORY_PREFIX)) { FileUtils.deleteRecursive(path, false); } } /** * Close the index writer and searcher and remove them from the index access * set. * * @param indexPath the index path * @throws SQLException on failure */ protected static void removeIndexAccess(String indexPath) throws SQLException { synchronized (INDEX_ACCESS) { try { IndexAccess access = INDEX_ACCESS.remove(indexPath); if(access != null) { access.close(); } } catch (Exception e) { throw convertException(e); } } } /** * Do the search. * * @param conn the database connection * @param text the query * @param limit the limit * @param offset the offset * @param data whether the raw data should be returned * @return the result set * @throws SQLException on failure */ protected static ResultSet search(Connection conn, String text, int limit, int offset, boolean data) throws SQLException { SimpleResultSet result = createResultSet(data); if (conn.getMetaData().getURL().startsWith("jdbc:columnlist:")) { // this is just to query the result set columns return result; } if (text == null || StringUtils.isWhitespaceOrEmpty(text)) { return result; } try { IndexAccess access = getIndexAccess(conn); // take a reference as the searcher may change IndexSearcher searcher = access.getSearcher(); try { // reuse the same analyzer; it's thread-safe; // also allows subclasses to control the analyzer used. Analyzer analyzer = access.writer.getAnalyzer(); StandardQueryParser parser = new StandardQueryParser(analyzer); Query query = parser.parse(text, LUCENE_FIELD_DATA); // Lucene insists on a hard limit and will not provide // a total hits value. Take at least 100 which is // an optimal limit for Lucene as any more // will trigger writing results to disk. int maxResults = (limit == 0 ? 100 : limit) + offset; TopDocs docs = searcher.search(query, maxResults); long totalHits = docs.totalHits.value; if (limit == 0) { // in this context it's safe to cast limit = (int) totalHits; } for (int i = 0, len = docs.scoreDocs.length; i < limit && i + offset < totalHits && i + offset < len; i++) { ScoreDoc sd = docs.scoreDocs[i + offset]; Document doc = searcher.doc(sd.doc); float score = sd.score; String q = doc.get(LUCENE_FIELD_QUERY); if (data) { int idx = q.indexOf(" WHERE "); JdbcConnection c = (JdbcConnection) conn; SessionLocal session = (SessionLocal) c.getSession(); Parser p = new Parser(session); String tab = q.substring(0, idx); ExpressionColumn expr = (ExpressionColumn) p .parseExpression(tab); String schemaName = expr.getOriginalTableAliasName(); String tableName = expr.getColumnName(session, -1); q = q.substring(idx + " WHERE ".length()); String[][] columnData = parseKey(conn, q); result.addRow(schemaName, tableName, columnData[0], columnData[1], score); } else { result.addRow(q, score); } } } finally { access.returnSearcher(searcher); } } catch (Exception e) { throw convertException(e); } return result; } /** * Trigger updates the index when a inserting, updating, or deleting a row. */ public static final class FullTextTrigger implements Trigger { private String schema; private String table; private int[] keys; private int[] indexColumns; private String[] columns; private int[] columnTypes; private String indexPath; private IndexAccess indexAccess; private final FieldType DOC_ID_FIELD_TYPE; public FullTextTrigger() { DOC_ID_FIELD_TYPE = new FieldType(TextField.TYPE_STORED); DOC_ID_FIELD_TYPE.setTokenized(false); DOC_ID_FIELD_TYPE.freeze(); } /** * INTERNAL * @see Trigger#init(Connection, String, String, String, boolean, int) */ @Override public void init(Connection conn, String schemaName, String triggerName, String tableName, boolean before, int type) throws SQLException { this.schema = schemaName; this.table = tableName; this.indexPath = getIndexPath(conn); this.indexAccess = getIndexAccess(conn); ArrayList keyList = Utils.newSmallArrayList(); DatabaseMetaData meta = conn.getMetaData(); ResultSet rs = meta.getColumns(null, StringUtils.escapeMetaDataPattern(schemaName), StringUtils.escapeMetaDataPattern(tableName), null); ArrayList columnList = Utils.newSmallArrayList(); while (rs.next()) { columnList.add(rs.getString("COLUMN_NAME")); } columnTypes = new int[columnList.size()]; columns = columnList.toArray(new String[0]); rs = meta.getColumns(null, StringUtils.escapeMetaDataPattern(schemaName), StringUtils.escapeMetaDataPattern(tableName), null); for (int i = 0; rs.next(); i++) { columnTypes[i] = rs.getInt("DATA_TYPE"); } if (keyList.isEmpty()) { rs = meta.getPrimaryKeys(null, StringUtils.escapeMetaDataPattern(schemaName), tableName); while (rs.next()) { keyList.add(rs.getString("COLUMN_NAME")); } } if (keyList.isEmpty()) { throw throwException("No primary key for table " + tableName); } ArrayList indexList = Utils.newSmallArrayList(); PreparedStatement prep = conn.prepareStatement( "SELECT COLUMNS FROM " + SCHEMA + ".INDEXES WHERE SCHEMA=? AND `TABLE`=?"); prep.setString(1, schemaName); prep.setString(2, tableName); rs = prep.executeQuery(); if (rs.next()) { String cols = rs.getString(1); if (cols != null) { Collections.addAll(indexList, StringUtils.arraySplit(cols, ',', true)); } } if (indexList.isEmpty()) { indexList.addAll(columnList); } keys = new int[keyList.size()]; setColumns(keys, keyList, columnList); indexColumns = new int[indexList.size()]; setColumns(indexColumns, indexList, columnList); } /** * INTERNAL * @see Trigger#fire(Connection, Object[], Object[]) */ @Override public void fire(Connection conn, Object[] oldRow, Object[] newRow) throws SQLException { if (oldRow != null) { if (newRow != null) { // update if (hasChanged(oldRow, newRow, indexColumns)) { delete(oldRow, false); insert(newRow, true); } } else { // delete delete(oldRow, true); } } else if (newRow != null) { // insert insert(newRow, true); } } /** * INTERNAL */ @Override public void close() throws SQLException { removeIndexAccess(indexPath); } /** * Commit all changes to the Lucene index. * @throws SQLException on failure */ void commitIndex() throws SQLException { try { indexAccess.commit(); } catch (IOException e) { throw convertException(e); } } /** * Add a row to the index. * * @param row the row * @param commitIndex whether to commit the changes to the Lucene index * @throws SQLException on failure */ void insert(Object[] row, boolean commitIndex) throws SQLException { String query = getQuery(row); Document doc = new Document(); doc.add(new Field(LUCENE_FIELD_QUERY, query, DOC_ID_FIELD_TYPE)); long time = System.currentTimeMillis(); doc.add(new Field(LUCENE_FIELD_MODIFIED, DateTools.timeToString(time, DateTools.Resolution.SECOND), TextField.TYPE_STORED)); StringBuilder builder = new StringBuilder(); for (int i = 0, length = indexColumns.length; i < length; i++) { int index = indexColumns[i]; String columnName = columns[index]; String data = asString(row[index], columnTypes[index]); // column names that start with _ // must be escaped to avoid conflicts // with internal field names (_DATA, _QUERY, _modified) if (columnName.startsWith(LUCENE_FIELD_COLUMN_PREFIX)) { columnName = LUCENE_FIELD_COLUMN_PREFIX + columnName; } doc.add(new Field(columnName, data, TextField.TYPE_NOT_STORED)); if (i > 0) { builder.append(' '); } builder.append(data); } FieldType dataFieldType = STORE_DOCUMENT_TEXT_IN_INDEX ? TextField.TYPE_STORED : TextField.TYPE_NOT_STORED; doc.add(new Field(LUCENE_FIELD_DATA, builder.toString(), dataFieldType)); try { indexAccess.writer.addDocument(doc); if (commitIndex) { commitIndex(); } } catch (IOException e) { throw convertException(e); } } /** * Delete a row from the index. * * @param row the row * @param commitIndex whether to commit the changes to the Lucene index * @throws SQLException on failure */ private void delete(Object[] row, boolean commitIndex) throws SQLException { String query = getQuery(row); try { Term term = new Term(LUCENE_FIELD_QUERY, query); indexAccess.writer.deleteDocuments(term); if (commitIndex) { commitIndex(); } } catch (IOException e) { throw convertException(e); } } private String getQuery(Object[] row) throws SQLException { StringBuilder builder = new StringBuilder(); if (schema != null) { StringUtils.quoteIdentifier(builder, schema).append('.'); } StringUtils.quoteIdentifier(builder, table).append(" WHERE "); for (int i = 0, length = keys.length; i < length; i++) { if (i > 0) { builder.append(" AND "); } int columnIndex = keys[i]; StringUtils.quoteIdentifier(builder, columns[columnIndex]); Object o = row[columnIndex]; if (o == null) { builder.append(" IS NULL"); } else { builder.append('=').append(FullText.quoteSQL(o, columnTypes[columnIndex])); } } return builder.toString(); } } /** * A wrapper for the Lucene writer and searcher. */ private static final class IndexAccess { /** * The index writer. */ final IndexWriter writer; /** * The index searcher. */ private IndexSearcher searcher; IndexAccess(IndexWriter writer) throws IOException { this.writer = writer; initializeSearcher(); } /** * Start using the searcher. * * @return the searcher * @throws IOException on failure */ synchronized IndexSearcher getSearcher() throws IOException { if (!searcher.getIndexReader().tryIncRef()) { initializeSearcher(); } return searcher; } private void initializeSearcher() throws IOException { IndexReader reader = DirectoryReader.open(writer); searcher = new IndexSearcher(reader); } /** * Stop using the searcher. * * @param searcher the searcher * @throws IOException on failure */ synchronized void returnSearcher(IndexSearcher searcher) throws IOException { searcher.getIndexReader().decRef(); } /** * Commit the changes. * @throws IOException on failure */ public synchronized void commit() throws IOException { writer.commit(); returnSearcher(searcher); searcher = new IndexSearcher(DirectoryReader.open(writer)); } /** * Close the index. * @throws IOException on failure */ public synchronized void close() throws IOException { searcher = null; writer.close(); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy