com.github.hakenadu.javalangchains.chains.data.reader.ReadDocumentsFromPdfChainBase Maven / Gradle / Ivy
package com.github.hakenadu.javalangchains.chains.data.reader;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
import org.apache.logging.log4j.LogManager;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import com.github.hakenadu.javalangchains.chains.Chain;
import com.github.hakenadu.javalangchains.util.PromptConstants;
/**
* provides base functionality for all pdf reading chains
*
* @param <I> input type to read pdfs from
*/
public abstract class ReadDocumentsFromPdfChainBase implements Chain>> {
/**
 * Configures how the content of each pdf is read into a string.
 */
public enum PdfReadMode {

    /** The whole document is read into one string. */
    WHOLE,

    /**
     * Each document is read page by page: a list of documents is provided for
     * each document and "p. ${pageIndex}" is added to each "source" field.
     */
    PAGES
}
/**
 * Pairs a {@link PDDocument} with its pdf name.
 */
protected class PdDocumentWrapper {

    /** the wrapped pdf document */
    private final PDDocument pdDocument;

    /** the pdf name belonging to {@link #pdDocument} */
    private final String pdDocumentName;

    /**
     * Builds a wrapper from a pdf document and its name.
     *
     * @param pdDocument     {@link #pdDocument}
     * @param pdDocumentName {@link #pdDocumentName}
     */
    protected PdDocumentWrapper(final PDDocument pdDocument, final String pdDocumentName) {
        this.pdDocumentName = pdDocumentName;
        this.pdDocument = pdDocument;
    }
}
/**
 * Selects how each pdf content is read into a string.
 *
 * @see PdfReadMode
 */
private final PdfReadMode readMode;
/**
 * if {@code true}, the reading is done in parallel
 */
private final boolean parallel;
/**
 * Creates a {@link ReadDocumentsFromPdfChainBase} and stores the passed
 * read mode and parallel flag.
 *
 * @param readMode {@link #readMode}
 * @param parallel {@link #parallel}
 */
protected ReadDocumentsFromPdfChainBase(final PdfReadMode readMode, final boolean parallel) {
    this.parallel = parallel;
    this.readMode = readMode;
}
/**
 * Loads the pdf(s) for an input instance. Abstract: each concrete chain
 * supplies its own way of loading.
 *
 * @param input input instance
 * @return stream of the loaded pdfs. NOTE(review): the element type is not
 *         visible in this declaration (raw {@link Stream}) - presumably
 *         {@link PdDocumentWrapper}; confirm
 *
 * @throws IOException on error loading pdf
 */
protected abstract Stream loadPdDocuments(I input) throws IOException;
@Override
public Stream