All downloads are free. The search and download functionality uses the official Maven repository.

com.github.hakenadu.javalangchains.chains.data.reader.ReadDocumentsFromPdfChain Maven / Gradle / Ivy

package com.github.hakenadu.javalangchains.chains.data.reader;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.stream.Stream;

import org.apache.pdfbox.Loader;

/**
 * Utilizes Apache PDFBox to read documents from a PDF or a folder of PDFs.
 * <p>
 * The input path is walked recursively and every regular file whose path ends
 * with {@code .pdf} (compared in lower case) is loaded via
 * {@link Loader#loadPDF(java.io.File)}.
 */
public class ReadDocumentsFromPdfChain extends ReadDocumentsFromPdfChainBase {

	/**
	 * creates a {@link ReadDocumentsFromPdfChain}
	 * 
	 * @param readMode {@link #readMode}
	 * @param parallel {@link #parallel}
	 */
	public ReadDocumentsFromPdfChain(final PdfReadMode readMode, final boolean parallel) {
		super(readMode, parallel);
	}

	/**
	 * creates a {@link ReadDocumentsFromPdfChain} with {@code parallel} set to
	 * {@code false}
	 * 
	 * @param readMode {@link #readMode}
	 */
	public ReadDocumentsFromPdfChain(final PdfReadMode readMode) {
		this(readMode, false);
	}

	/**
	 * creates a {@link ReadDocumentsFromPdfChain} which reads the whole pdf as a
	 * document (i.e. uses {@link PdfReadMode#WHOLE} and {@code parallel = false})
	 */
	public ReadDocumentsFromPdfChain() {
		this(PdfReadMode.WHOLE);
	}

	/**
	 * Lazily loads every PDF found at or below {@code input}.
	 * <p>
	 * {@code input} may be a single file or a directory; it is traversed with
	 * {@link Files#walk(Path, java.nio.file.FileVisitOption...)}. Only regular
	 * files whose path string ends with {@code .pdf} after lower-casing are
	 * considered. Each match is opened with PDFBox and wrapped together with its
	 * file name.
	 * <p>
	 * The returned stream is backed by the open directory walk and therefore holds
	 * file-system resources until it is closed. The consumer is responsible for
	 * closing it (e.g. with try-with-resources).
	 * NOTE(review): confirm that the base class closes this stream and the loaded
	 * documents once it has consumed them.
	 * <p>
	 * NOTE(review): the return type is a raw {@link Stream} here; the intended
	 * element type is presumably {@code PdDocumentWrapper} - verify against the
	 * declaration in the base class.
	 *
	 * @param input path of a PDF file or of a folder containing PDF files
	 * @return a lazy stream of wrapped PDFBox documents
	 * @throws IOException           if the walk over {@code input} cannot be
	 *                               started
	 * @throws IllegalStateException while the stream is being consumed, if a PDF
	 *                               cannot be loaded; the underlying
	 *                               {@link IOException} is attached as the cause
	 */
	@Override
	protected Stream loadPdDocuments(final Path input) throws IOException {
		return Files.walk(input).filter(Files::isRegularFile)
				.filter(path -> path.toString().toLowerCase().endsWith(".pdf")).map(path -> {
					try {
						return new PdDocumentWrapper(Loader.loadPDF(path.toFile()), path.getFileName().toString());
					} catch (final IOException ioException) {
						// keep the original failure as the cause so the real reason (corrupt
						// file, missing permissions, ...) is not lost to the caller
						throw new IllegalStateException("could not read document from " + path, ioException);
					}
				});
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy