All Downloads are FREE. Search and download functionalities are using the official Maven repository.

eu.openminted.uc.socialsciences.kb.preparation.util.convert.PDFConverter Maven / Gradle / Ivy

The newest version!
package eu.openminted.uc.socialsciences.kb.preparation.util.convert;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.log4j.Logger;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;

/*
 * This class was used before we settled on a PDF converter. It can be removed
 * once all of its uses have been replaced with calls to the PDF converter class
 * from the ss-io-pdfx-xml module.
 */

/**
 * Static helpers that extract the plain text of documents using a selectable
 * {@link Converter} back-end.
 * <p>
 * Only the Tika back-end is implemented in this class; the GROBID, JPOD and
 * PDFBOX back-ends are placeholders that always yield {@code null}.
 */
public class PDFConverter {

	private static final Logger logger = Logger.getLogger(PDFConverter.class);

	/**
	 * Extracts the text of a single document given as a {@link File}.
	 *
	 * @param docFile
	 *            the document to convert; must not be {@code null}
	 * @param converter
	 *            the back-end to use
	 * @return the extracted text, or {@code null} under the same conditions as
	 *         {@link #convert(Path, Converter)}
	 */
	public static String convert(File docFile, Converter converter) {
		return convert(docFile.toPath(), converter);
	}

	/**
	 * Extracts the text of a single document given as a {@link Path}.
	 *
	 * @param docPath
	 *            the document to convert; must not be {@code null}
	 * @param converter
	 *            the back-end to use
	 * @return the extracted text, or {@code null} if the path has no file name,
	 *         does not denote a regular file, the selected back-end is not
	 *         implemented, or the conversion failed
	 */
	public static String convert(Path docPath, Converter converter) {
		Path fileName = docPath.getFileName();
		if (fileName == null || !Files.isRegularFile(docPath)) {
			return null;
		}

		switch (converter) {
		case GROBID:
			return convertWithGrobid(docPath);
		case JPOD:
			return convertWithJPod(docPath);
		case PDFBOX:
			return convertWithPdfBox(docPath);
		case TIKA:
			return convertWithTika(docPath);
		default:
			// Any other converter constant has no back-end here.
			return null;
		}
	}

	/**
	 * Extracts text with Apache Tika using its default configuration.
	 *
	 * @param docPath
	 *            the document to parse
	 * @return the extracted text, or {@code null} if Tika could not read or
	 *         parse the document (the failure is logged)
	 */
	private static String convertWithTika(Path docPath) {
		// Create a Tika instance with the default configuration
		Tika tika = new Tika();
		tika.setMaxStringLength(-1); // disable max length

		try {
			return tika.parseToString(docPath);
		} catch (IOException | TikaException e) {
			// Conversion stays best-effort (null result), but the failure must
			// be visible instead of being silently swallowed.
			logger.error("Could not extract text from " + docPath + " with Tika", e);
			return null;
		}
	}

	/**
	 * Placeholder for a PDFBox back-end; not implemented.
	 *
	 * @param docPath
	 *            the document to convert (currently ignored)
	 * @return always {@code null}
	 */
	private static String convertWithPdfBox(Path docPath) {
		return null;
	}

	/**
	 * Placeholder for a jPod back-end; not implemented.
	 *
	 * @param docPath
	 *            the document to convert (currently ignored)
	 * @return always {@code null}
	 */
	private static String convertWithJPod(Path docPath) {
		return null;
	}

	/**
	 * Placeholder for a GROBID back-end; not implemented.
	 *
	 * @param docPath
	 *            the document to convert (currently ignored)
	 * @return always {@code null}
	 */
	private static String convertWithGrobid(Path docPath) {
		return null;
	}

	/**
	 * Converts several documents and collects the results by file name.
	 * <p>
	 * The map key is only the last path element, so two documents with the same
	 * file name in different directories overwrite each other. Values are
	 * {@code null} wherever {@link #convert(Path, Converter)} returns
	 * {@code null}.
	 *
	 * @param docPaths
	 *            the documents to convert; must not be {@code null}
	 * @param converter
	 *            the back-end to use
	 * @return a new mutable map from file name to extracted text
	 */
	public static Map<String, String> convert(List<Path> docPaths, Converter converter) {
		Map<String, String> result = new HashMap<>();

		for (Path path : docPaths) {
			Path fileName = (path == null) ? null : path.getFileName();
			if (fileName == null) {
				// A null entry or a root path offers no usable map key; skip it
				// rather than failing the whole batch with a NullPointerException.
				logger.warn("Skipping path without a file name: " + path);
				continue;
			}
			result.put(fileName.toString(), convert(path, converter));
		}
		return result;
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy