All downloads are free. Search and download functionality uses the official Maven repository.

dev.langchain4j.data.document.loader.gcs.GoogleCloudStorageDocumentLoader Maven / Gradle / Ivy

There is a newer version: 1.0.0-alpha1
Show newest version
package dev.langchain4j.data.document.loader.gcs;

import com.google.auth.Credentials;
import com.google.cloud.storage.Blob;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageOptions;
import com.google.api.gax.paging.Page;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentLoader;
import dev.langchain4j.data.document.DocumentParser;
import dev.langchain4j.data.document.source.gcs.GcsSource;

import java.util.ArrayList;
import java.util.List;

import static dev.langchain4j.internal.ValidationUtils.ensureNotBlank;
import static dev.langchain4j.internal.ValidationUtils.ensureNotNull;

/**
 * Google Cloud Storage Document Loader to load documents from Google Cloud Storage buckets.
 * <p>
 * Instances are created through {@link #builder()}. Both the project id and the credentials are optional;
 * when one is not supplied, the corresponding {@link StorageOptions} builder setting is left untouched.
 */
public class GoogleCloudStorageDocumentLoader {

    private final Storage storage;

    /**
     * Creates the loader and the underlying {@link Storage} client.
     *
     * @param project     project id to set on the storage options; skipped when {@code null},
     *                    rejected by {@code ensureNotBlank} when blank.
     * @param credentials credentials to set on the storage options; skipped when {@code null}.
     */
    private GoogleCloudStorageDocumentLoader(String project, Credentials credentials) {
        StorageOptions.Builder storageBuilder = StorageOptions.newBuilder();

        if (project != null) {
            storageBuilder.setProjectId(ensureNotBlank(project, "project"));
        }

        if (credentials != null) {
            storageBuilder.setCredentials(credentials);
        }

        this.storage = storageBuilder.build().getService();
    }

    /**
     * Loads a single document from the specified Google Cloud Storage bucket based on the specified object key.
     *
     * @param bucket     GCS bucket to load from.
     * @param objectName The key of the GCS object which should be loaded.
     * @param parser     The parser to be used for parsing text from the object. Must not be {@code null}.
     * @return A document containing the content of the GCS object.
     * @throws IllegalArgumentException if no object with the given name exists in the bucket.
     */
    public Document loadDocument(String bucket, String objectName, DocumentParser parser) {
        // Validate the parser up front so an invalid call fails before any remote request is made.
        ensureNotNull(parser, "parser");

        Blob blob = storage.get(bucket, objectName);
        if (blob == null) {
            throw new IllegalArgumentException("Object gs://" + bucket + "/" + objectName + " couldn't be found.");
        }

        return DocumentLoader.load(new GcsSource(blob), parser);
    }

    /**
     * Load a list of documents from the specified bucket, filtered with a glob pattern.
     * <p>
     * The listing is always requested with {@code Storage.BlobListOption.currentDirectory()}; the glob
     * filter is only added when {@code globPattern} is not {@code null}.
     *
     * @param bucket      the bucket to load files from
     * @param globPattern filter only files matching the glob pattern (may be {@code null} for no filter), see
     *                    <a href="https://cloud.google.com/storage/docs/json_api/v1/objects/list#list-object-glob">list-object-glob</a>
     * @param parser      the parser to use to parse the document. Must not be {@code null}.
     * @return A list of documents from the bucket that match the glob pattern.
     */
    public List<Document> loadDocuments(String bucket, String globPattern, DocumentParser parser) {
        // Validate once, before listing, instead of once per blob inside the loop.
        ensureNotNull(parser, "parser");

        Page<Blob> blobs = globPattern != null ?
            storage.list(bucket, Storage.BlobListOption.currentDirectory(), Storage.BlobListOption.matchGlob(globPattern)) :
            storage.list(bucket, Storage.BlobListOption.currentDirectory());

        List<Document> documents = new ArrayList<>();

        // iterateAll() walks every page of the listing, not only the first one.
        for (Blob blob : blobs.iterateAll()) {
            documents.add(DocumentLoader.load(new GcsSource(blob), parser));
        }

        return documents;
    }

    /**
     * Loads all documents from a GCS bucket, without any glob filter.
     * Delegates to {@link #loadDocuments(String, String, DocumentParser)} with a {@code null} pattern.
     *
     * @param bucket the bucket to load from.
     * @param parser The parser to be used for parsing text from the object. Must not be {@code null}.
     * @return A list of documents.
     */
    public List<Document> loadDocuments(String bucket, DocumentParser parser) {
        return loadDocuments(bucket, null, parser);
    }

    /**
     * Creates a new builder for {@link GoogleCloudStorageDocumentLoader}.
     *
     * @return a fresh builder with no project and no credentials set.
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Builder for {@link GoogleCloudStorageDocumentLoader}. Every setting is optional.
     */
    public static class Builder {
        private String project;
        private Credentials credentials;

        /**
         * Sets the project id passed to the storage options.
         *
         * @param project the project id; validated as non-blank at {@link #build()} time when not {@code null}.
         * @return this builder.
         */
        public Builder project(String project) {
            this.project = project;
            return this;
        }

        /**
         * Sets the credentials passed to the storage options.
         *
         * @param credentials the credentials to use; may be {@code null} to leave them unset.
         * @return this builder.
         */
        public Builder credentials(Credentials credentials) {
            this.credentials = credentials;
            return this;
        }

        /**
         * Builds the loader, creating the underlying storage client.
         *
         * @return a new {@link GoogleCloudStorageDocumentLoader}.
         */
        public GoogleCloudStorageDocumentLoader build() {
            return new GoogleCloudStorageDocumentLoader(project, credentials);
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy