
// prerna.engine.impl.vector.FaissDatabaseEngine (Maven / Gradle / Ivy listing - newest version)
package prerna.engine.impl.vector;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import prerna.cluster.util.ClusterUtil;
import prerna.cluster.util.CopyFilesToEngineRunner;
import prerna.cluster.util.DeleteFilesFromEngineRunner;
import prerna.ds.py.PyUtils;
import prerna.engine.api.VectorDatabaseTypeEnum;
import prerna.om.Insight;
import prerna.query.querystruct.filters.AndQueryFilter;
import prerna.query.querystruct.filters.BetweenQueryFilter;
import prerna.query.querystruct.filters.IQueryFilter;
import prerna.query.querystruct.filters.OrQueryFilter;
import prerna.query.querystruct.filters.SimpleQueryFilter;
import prerna.query.querystruct.filters.SimpleQueryFilter.FILTER_TYPE;
import prerna.query.querystruct.selectors.IQuerySelector;
import prerna.query.querystruct.selectors.QueryColumnSelector;
import prerna.query.querystruct.selectors.QueryConstantSelector;
import prerna.reactor.qs.SubQueryExpression;
import prerna.reactor.vector.VectorDatabaseParamOptionsEnum;
import prerna.sablecc2.om.PixelDataType;
import prerna.sablecc2.om.nounmeta.NounMetadata;
import prerna.util.Constants;
import prerna.util.Utility;
import prerna.util.sql.AbstractSqlQueryUtil;
public class FaissDatabaseEngine extends AbstractVectorDatabaseEngine {
private static final Logger classLogger = LogManager.getLogger(FaissDatabaseEngine.class);
private String vectorDatabaseSearcher = null;
/**
 * Opens the engine from its SMSS properties and assigns a unique python
 * variable name under which this engine's FAISS searcher object will live.
 *
 * @param smssProp engine SMSS properties
 * @throws Exception if the parent open routine fails
 */
@Override
public void open(Properties smssProp) throws Exception {
    super.open(smssProp);
    // randomized so multiple engines sharing one python process never collide
    String searcherVariableName = Utility.getRandomString(6);
    this.vectorDatabaseSearcher = searcherVariableName;
}
/**
 * Builds the python commands that initialize this engine's FAISS database
 * object and, for every index class already on disk, recreate its searcher
 * and reload any previously persisted dataset/vector pickle files.
 *
 * @return the ordered python commands to run at server start
 */
@Override
protected String[] getServerStartCommands() {
    // ${...} placeholders are substituted later by the caller
    String faissInitScript = this.vectorDatabaseSearcher+"=vector_database.FAISSDatabase("
            + "embedder_engine_id = '${EMBEDDER_ENGINE_ID}', "
            + "tokenizer = cfg_tokenizer, "
            + "keyword_engine_id = '${KEYWORD_ENGINE_ID}', "
            + "distance_method = '${DISTANCE_METHOD}')";
    String[] commands = (TOKENIZER_INIT_SCRIPT+faissInitScript).split(PyUtils.PY_COMMAND_SEPARATOR);
    // need to iterate through and potentially spin up the searchers themselves
    if (this.indexClasses.size() > 0) {
        List<String> modifiedCommands = new ArrayList<>(Arrays.asList(commands));
        for (String indexClass : this.indexClasses) {
            File fileToCheck = new File(this.schemaFolder.getAbsolutePath() + DIR_SEPARATOR + indexClass, "dataset.pkl");
            // hoisted: normalize to forward slashes once instead of per command
            String basePath = fileToCheck.getParent().replace(FILE_SEPARATOR, DIR_SEPARATOR) + DIR_SEPARATOR;
            modifiedCommands.add(this.vectorDatabaseSearcher+".create_searcher(searcher_name = '"+indexClass+"', base_path = '"+basePath+"')");
            // only reload persisted pickles when a prior index actually exists
            if (fileToCheck.exists()) {
                modifiedCommands.add(this.vectorDatabaseSearcher+".searchers['"+indexClass+"'].load_dataset('"+basePath+"' + 'dataset.pkl')");
                modifiedCommands.add(this.vectorDatabaseSearcher+".searchers['"+indexClass+"'].load_encoded_vectors('"+basePath+"' + 'vectors.pkl')");
            }
        }
        commands = modifiedCommands.toArray(new String[0]);
    }
    return commands;
}
/**
 * @return the distance metric FAISS uses when none is configured on the SMSS
 */
@Override
protected String getDefaultDistanceMethod() {
    return "Cosine Similarity";
}
/**
 * Registers a new index class with this engine and creates the matching
 * searcher inside the python FAISS database object.
 *
 * @param indexClass name of the index class (searcher) to create
 */
protected void addIndexClass(String indexClass) {
    if (!modelPropsLoaded) {
        verifyModelProps();
    }
    checkSocketStatus();
    this.indexClasses.add(indexClass);
    //TODO: do we really need base path for this?
    // python expects forward slashes with a trailing separator
    String searcherBasePath = this.schemaFolder.getAbsolutePath().replace(FILE_SEPARATOR, DIR_SEPARATOR)
            + DIR_SEPARATOR + indexClass + DIR_SEPARATOR;
    StringBuilder createSearcherScript = new StringBuilder(this.vectorDatabaseSearcher)
            .append(".create_searcher(searcher_name = '").append(indexClass)
            .append("', base_path = '").append(searcherBasePath).append("')");
    this.pyt.runScript(createSearcherScript.toString());
}
/**
 * Intentionally a no-op for FAISS: the per-document index files must be
 * retained on disk so the master file index can be re-created later.
 *
 * @param indexFilesFolder folder holding the per-document index files
 */
@Override
protected void cleanUpAddDocument(File indexFilesFolder) {
    // deliberately keep indexFilesFolder - required for master index regeneration
}
/**
 * Embeds the content of the given vector csv files into the FAISS index for
 * the requested index class, copying each file into the engine's document and
 * indexed-files directories first and pushing artifacts to the cloud when
 * running clustered.
 *
 * NOTE(review): assumes each csv exposes a 'Content' column to encode - the
 * python side allows more, but this method hard-codes that single column.
 *
 * @param vectorCsvFiles paths to csv files to embed; entries may be rewritten
 *                       in place to point at the engine's indexed-files copies
 * @param insight        insight context used for the python invocation
 * @param parameters     optional settings (indexClass, columns to remove,
 *                       keyword search params)
 * @throws Exception if file moves or the python execution fail
 */
@Override
public void addEmbeddings(List<String> vectorCsvFiles, Insight insight, Map<String, Object> parameters) throws Exception {
    if (!modelPropsLoaded) {
        verifyModelProps();
    }
    checkSocketStatus();
    // determine which index class (searcher) these files belong to
    String indexClass = this.defaultIndexClass;
    if (parameters.containsKey("indexClass")) {
        indexClass = (String) parameters.get("indexClass");
    }
    if (!this.indexClasses.contains(indexClass)) {
        addIndexClass(indexClass);
    }
    File indexDirectory = new File(this.schemaFolder, indexClass);
    File documentDir = new File(indexDirectory, DOCUMENTS_FOLDER_NAME);
    File indexFilesDir = new File(indexDirectory, INDEXED_FOLDER_NAME);
    if (!documentDir.exists()) {
        documentDir.mkdirs();
    }
    if (!indexFilesDir.exists()) {
        indexFilesDir.mkdirs();
    }
    // track files to push to cloud
    Set<String> filesToCopyToCloud = new HashSet<>();
    // check that the vectorCsvFiles are in the current engine folder
    // if not, move them
    for (int i = 0; i < vectorCsvFiles.size(); i++) {
        String vectorCsvFile = vectorCsvFiles.get(i);
        File vectorF = new File(Utility.normalizePath(vectorCsvFile));
        // double check that they are files and not directories
        if (!vectorF.isFile()) {
            continue;
        }
        if (!vectorF.getCanonicalPath().contains(documentDir.getCanonicalPath() + FILE_SEPARATOR)) {
            File documentDestinationFile = new File(documentDir, vectorF.getName());
            // check if the destination file exists, and if so, delete it
            try {
                if (documentDestinationFile.exists()) {
                    FileUtils.forceDelete(documentDestinationFile);
                }
                // only copy the csv if there is not already a file there with the same base name
                String baseName = FilenameUtils.getBaseName(vectorF.getName());
                // listFiles() returns null on I/O error - guard instead of NPE'ing
                File[] existingDocuments = documentDir.listFiles();
                boolean fileWithSameBaseNameExists = existingDocuments != null
                        && Arrays.stream(existingDocuments)
                                .anyMatch(file -> FilenameUtils.getBaseName(file.getName()).equals(baseName));
                if (!fileWithSameBaseNameExists) {
                    FileUtils.copyFileToDirectory(vectorF, documentDir, true);
                    // store to move to cloud
                    filesToCopyToCloud.add(documentDestinationFile.getAbsolutePath());
                }
            } catch (IOException e) {
                classLogger.error(Constants.STACKTRACE, e);
                throw new IllegalArgumentException("Unable to remove previously created file for " + documentDestinationFile.getName() + " or move it to the document directory");
            }
        }
        if (!vectorF.getCanonicalPath().contains(indexFilesDir.getCanonicalPath() + FILE_SEPARATOR)) {
            File indexDestinationFile = new File(indexFilesDir, vectorF.getName());
            // check if the destination file exists, and if so, delete it
            try {
                if (indexDestinationFile.exists()) {
                    FileUtils.forceDelete(indexDestinationFile);
                }
                FileUtils.copyFileToDirectory(vectorF, indexFilesDir, true);
                // store to move to cloud
                filesToCopyToCloud.add(indexDestinationFile.getAbsolutePath());
            } catch (IOException e) {
                classLogger.error(Constants.STACKTRACE, e);
                throw new IllegalArgumentException("Unable to remove previously created file for " + indexDestinationFile.getName() + " or move it to the document directory");
            }
            // also update the reference to this folder
            vectorCsvFiles.set(i, indexDestinationFile.getAbsolutePath());
        }
    }
    // now clean the paths for python (forward slashes)
    {
        List<String> temp = new ArrayList<>(vectorCsvFiles.size());
        for (int i = 0; i < vectorCsvFiles.size(); i++) {
            temp.add(vectorCsvFiles.get(i).replace(FILE_SEPARATOR, DIR_SEPARATOR));
        }
        vectorCsvFiles = temp;
    }
    // assuming only content to index now
    // yes... the python code is more flexible and allows you to concat multiple values in the csv to encode
    String columnsToIndex = "['Content']";
    // create dataset - build the addDocument(...) call on the relevant FAISS searcher
    StringBuilder addDocumentPyCommand = new StringBuilder();
    addDocumentPyCommand.append(vectorDatabaseSearcher)
            .append(".searchers['")
            .append(indexClass)
            .append("']");
    addDocumentPyCommand.append(".addDocument(documentFileLocation = ['")
            .append(String.join("','", vectorCsvFiles))
            .append("'], insight_id = '")
            .append(insight.getInsightId())
            .append("', columns_to_index = ")
            .append(columnsToIndex);
    if (parameters.containsKey(VectorDatabaseParamOptionsEnum.COLUMNS_TO_REMOVE.getKey())) {
        // add the columns based in the vector db query
        addDocumentPyCommand.append(", ")
                .append("columns_to_remove")
                .append(" = ")
                .append(PyUtils.determineStringType(
                        parameters.get(VectorDatabaseParamOptionsEnum.COLUMNS_TO_REMOVE.getKey())));
    }
    if (parameters.containsKey(VectorDatabaseParamOptionsEnum.KEYWORD_SEARCH_PARAM.getKey())) {
        // add the columns based in the vector db query
        addDocumentPyCommand.append(", ")
                .append("keyword_search_params")
                .append(" = ")
                .append(PyUtils.determineStringType(
                        parameters.get(VectorDatabaseParamOptionsEnum.KEYWORD_SEARCH_PARAM.getKey())));
    }
    addDocumentPyCommand.append(")");
    String script = addDocumentPyCommand.toString();
    classLogger.info("Running >>>" + script);
    @SuppressWarnings("unchecked")
    Map<String, Object> pythonResponseAfterCreatingFiles = (Map<String, Object>) this.pyt.runSmssWrapperEval(script, insight);
    if (ClusterUtil.IS_CLUSTER) {
        // this should already be handled, but just in case...
        filesToCopyToCloud.addAll(vectorCsvFiles);
        // and the return files (dataset/vector)
        @SuppressWarnings("unchecked")
        List<String> createdDocuments = (List<String>) pythonResponseAfterCreatingFiles.get("createdDocuments");
        filesToCopyToCloud.addAll(createdDocuments);
        Thread copyFilesToCloudThread = new Thread(new CopyFilesToEngineRunner(engineId, this.getCatalogType(), filesToCopyToCloud.stream().toArray(String[]::new)));
        copyFilesToCloudThread.start();
    }
    // verify the index class loaded the dataset
    StringBuilder checkForEmptyDatabase = new StringBuilder();
    checkForEmptyDatabase.append(this.vectorDatabaseSearcher)
            .append(".searchers['")
            .append(indexClass)
            .append("']")
            .append(".datasetsLoaded()");
    boolean datasetsLoaded = (boolean) pyt.runScript(checkForEmptyDatabase.toString());
    // the original discarded this flag - surface a warning so silent failures are visible
    if (!datasetsLoaded) {
        classLogger.warn("FAISS searcher for index class '" + indexClass + "' did not load its dataset after addDocument");
    }
}
/**
 * Convenience overload: embeds a single csv file path.
 *
 * @param vectorCsvFile path to the csv file to embed
 * @param insight       insight context for the python invocation
 * @param parameters    optional settings passed through
 * @throws Exception if the embedding fails
 */
@Override
public void addEmbeddings(String vectorCsvFile, Insight insight, Map<String, Object> parameters) throws Exception {
    // mutable list - the delegate may rewrite entries in place
    List<String> singleFile = new ArrayList<>(1);
    singleFile.add(vectorCsvFile);
    addEmbeddings(singleFile, insight, parameters);
}
/**
 * Convenience overload: embeds a list of csv {@link File}s by delegating to
 * the path-based implementation.
 *
 * @param vectorCsvFiles csv files to embed
 * @param insight        insight context for the python invocation
 * @param parameters     optional settings passed through
 * @throws Exception if the embedding fails
 */
@Override
public void addEmbeddingFiles(List<File> vectorCsvFiles, Insight insight, Map<String, Object> parameters) throws Exception {
    List<String> filePaths = new ArrayList<>(vectorCsvFiles.size());
    for (File csvFile : vectorCsvFiles) {
        filePaths.add(csvFile.getAbsolutePath());
    }
    addEmbeddings(filePaths, insight, parameters);
}
/**
 * Convenience overload: embeds a single csv {@link File}.
 *
 * @param vectorCsvFile csv file to embed
 * @param insight       insight context for the python invocation
 * @param parameters    optional settings passed through
 * @throws Exception if the embedding fails
 */
@Override
public void addEmbeddingFile(File vectorCsvFile, Insight insight, Map<String, Object> parameters) throws Exception {
    // mutable list - the delegate may rewrite entries in place
    List<String> singleFile = new ArrayList<>(1);
    singleFile.add(vectorCsvFile.getAbsolutePath());
    addEmbeddings(singleFile, insight, parameters);
}
/**
 * Convenience overload: embeds the csv backing a {@code VectorDatabaseCSVTable}.
 *
 * @param vectorCsvTable table whose backing file will be embedded
 * @param insight        insight context for the python invocation
 * @param parameters     optional settings passed through
 * @throws Exception if the embedding fails
 */
@Override
public void addEmbeddings(VectorDatabaseCSVTable vectorCsvTable, Insight insight, Map<String, Object> parameters) throws Exception {
    // mutable list - the delegate may rewrite entries in place
    List<String> singleFile = new ArrayList<>(1);
    singleFile.add(vectorCsvTable.getFile().getAbsolutePath());
    addEmbeddings(singleFile, insight, parameters);
}
/**
 * Removes the given documents (and their per-document pickle artifacts) from
 * the index class. If the index class ends up with no pickles at all, the
 * whole index class folder and its python searcher are deleted; otherwise the
 * master dataset/vector files are regenerated from the remaining documents.
 *
 * Fixes over the previous version: each {@link DirectoryStream} is now closed
 * via try-with-resources (previously one stream was opened per document but
 * only the last was closed), and a null return from {@code File.list} no
 * longer NPEs.
 *
 * @param fileNames  document names or csv file paths to remove
 * @param parameters optional settings (index class override)
 * @throws IOException if canonical path resolution or deletion fails
 */
@Override
public void removeDocument(List<String> fileNames, Map<String, Object> parameters) throws IOException {
    String indexClass = this.defaultIndexClass;
    if (parameters.containsKey(INDEX_CLASS)) {
        indexClass = (String) parameters.get(INDEX_CLASS);
    }
    if (!this.indexClasses.contains(indexClass)) {
        throw new IllegalArgumentException("Unable to remove documents from a directory that does not exist");
    }
    checkSocketStatus();
    List<String> filesToRemoveFromCloud = new ArrayList<>();
    String indexedFilesPath = this.schemaFolder.getAbsolutePath() + DIR_SEPARATOR + indexClass + DIR_SEPARATOR + "indexed_files";
    Path indexDirectory = Paths.get(indexedFilesPath);
    // expand csv inputs into their source-column entries; plain names pass through
    List<String> sourceNames = new ArrayList<>();
    for (String document : fileNames) {
        String documentName = FilenameUtils.getName(document);
        File f = new File(document);
        if (f.exists() && f.getName().endsWith(".csv")) {
            sourceNames.addAll(VectorDatabaseCSVTable.pullSourceColumn(f));
        } else {
            sourceNames.add(documentName);
        }
    }
    for (String document : sourceNames) {
        String documentName = FilenameUtils.getName(document);
        String[] fileNamesToDelete = {documentName + "_dataset.pkl", documentName + "_vectors.pkl", documentName + ".csv"};
        // Create a filter for the file names
        DirectoryStream.Filter<Path> fileNameFilters = entry -> {
            String fileName = entry.getFileName().toString();
            for (String fileNameToDelete : fileNamesToDelete) {
                if (fileName.equals(fileNameToDelete)) {
                    return true;
                }
            }
            return false;
        };
        // try-with-resources: every per-document stream is closed, not just the last
        try (DirectoryStream<Path> stream = Files.newDirectoryStream(indexDirectory, fileNameFilters)) {
            for (Path entry : stream) {
                // Delete each file that matches the specified file name
                try {
                    Files.delete(entry);
                    filesToRemoveFromCloud.add(entry.toString());
                } catch (IOException e) {
                    classLogger.error(Constants.STACKTRACE, e);
                    throw new IllegalArgumentException("Unable to remove file: " + entry.getFileName());
                }
                classLogger.info("Deleted: " + entry.toString());
            }
        } catch (IOException e) {
            classLogger.error(Constants.STACKTRACE, e);
            throw new IllegalArgumentException("Unable to determine files in " + indexDirectory.getFileName());
        }
        // also remove the raw document from the documents folder, if present
        try {
            File documentFile = new File(this.schemaFolder.getAbsolutePath() + DIR_SEPARATOR + indexClass + DIR_SEPARATOR + "documents", document);
            if (documentFile.exists() && documentFile.isFile()) {
                FileUtils.forceDelete(documentFile);
                filesToRemoveFromCloud.add(documentFile.getAbsolutePath());
            }
        } catch (IOException e) {
            classLogger.error(Constants.STACKTRACE, e);
            throw new IllegalArgumentException("Unable to delete " + document + " from documents directory");
        }
    }
    // if no per-document pickles remain, the indexClass is now empty and should be deleted
    File indexedFolder = new File(indexedFilesPath);
    String[] remainingPklFiles = indexedFolder.list(new FilenameFilter() {
        @Override
        public boolean accept(File dir, String name) {
            return name.endsWith(".pkl");
        }
    });
    // list() returns null if the folder vanished - treat that the same as empty
    if (remainingPklFiles == null || remainingPklFiles.length == 0) {
        try {
            File indexClassDirectory = new File(indexedFolder.getParent());
            // remove the master dataset and vector files
            filesToRemoveFromCloud.add(new File(indexClassDirectory, "dataset.pkl").getAbsolutePath());
            filesToRemoveFromCloud.add(new File(indexClassDirectory, "vectors.pkl").getAbsolutePath());
            // delete the entire folder
            FileUtils.forceDelete(indexClassDirectory);
        } catch (IOException e) {
            classLogger.error(Constants.STACKTRACE, e);
            throw new IllegalArgumentException("Unable to delete the index class folder");
        }
        this.pyt.runScript(this.vectorDatabaseSearcher + ".delete_searcher(searcher_name = '"+indexClass+"')");
        this.indexClasses.remove(indexClass);
    } else {
        // Regenerate the master "dataset.pkl" and "vectors.pkl" files
        StringBuilder updateMasterFilesCommand = new StringBuilder();
        updateMasterFilesCommand.append(this.vectorDatabaseSearcher)
                .append(".searchers['")
                .append(indexClass)
                .append("']")
                .append(".createMasterFiles(path_to_files = '")
                .append(indexDirectory.getParent().toString().replace(FILE_SEPARATOR, DIR_SEPARATOR))
                .append("')");
        String script = updateMasterFilesCommand.toString();
        classLogger.info("Running >>>" + script);
        this.pyt.runScript(script);
    }
    if (ClusterUtil.IS_CLUSTER) {
        Thread deleteFilesFromCloudThread = new Thread(new DeleteFilesFromEngineRunner(engineId, this.getCatalogType(), filesToRemoveFromCloud.stream().toArray(String[]::new)));
        deleteFilesFromCloudThread.start();
    }
}
@SuppressWarnings("unchecked")
@Override
public List
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy