All Downloads are FREE. Search and download functionality uses the official Maven repository.

eu.openminted.uc.socialsciences.kb.preparation.util.input.GoldDataReader Maven / Gradle / Ivy

The newest version!
package eu.openminted.uc.socialsciences.kb.preparation.util.input;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Stream;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import eu.openminted.uc.socialsciences.kb.preparation.util.output.DBManager;

/**
 * Gold data is in xlsx format. This class reads the data from the tables and
 * hands every data row to a {@link DBManager} so that it can be stored for
 * later evaluation.
 * <p>
 * All regular files whose name ends in {@code .xlsx} below the root directory
 * passed to the constructor are collected. For every sheet of every workbook
 * the sheet name is used as the dataset ID and the first row is treated as a
 * header: string cells reading {@code Variable}, {@code Paper} and
 * {@code Reference} select the columns that are read from the remaining rows.
 * Labels that are missing from a header fall back to the default columns
 * declared below.
 *
 * @author neumanmy
 *
 */
public class GoldDataReader {

	/** Column read as the variable when the header has no "Variable" label. */
	private static final int DEFAULT_VARIABLE_COLUMN = 0;
	/** Column read as the paper when the header has no "Paper" label. */
	private static final int DEFAULT_PAPER_COLUMN = 4;
	/** Column read as the reference text when the header has no "Reference" label. */
	private static final int DEFAULT_REFERENCE_COLUMN = 3;

	/** The xlsx files found below the root directory, in traversal order. */
	private final List<Path> toProcess = new ArrayList<>();

	/** Receiver of the rows that are read; set by {@link #readData(DBManager)}. */
	private DBManager writer;

	// Column layout of the sheet currently being processed. These are instance
	// fields (not static) so that one reader cannot change the layout used by
	// another, and they are re-initialised for every sheet by setLabels.
	private int variableColumn = DEFAULT_VARIABLE_COLUMN;
	private int paperColumn = DEFAULT_PAPER_COLUMN;
	private int referenceColumn = DEFAULT_REFERENCE_COLUMN;

	/**
	 * Creates a reader for all xlsx files below the given directory.
	 *
	 * @param root
	 *            directory that is searched recursively for {@code .xlsx} files
	 */
	public GoldDataReader(Path root) {
		setRootDir(root);
	}

	/**
	 * Collects every regular file below {@code root} whose path ends in
	 * {@code .xlsx}. An {@link IOException} while opening the directory is
	 * reported on standard error and leaves the list of files as it is.
	 *
	 * @param root
	 *            directory to scan
	 */
	private void setRootDir(Path root) {
		// Files.walk keeps directory handles open until the stream is closed.
		try (Stream<Path> paths = Files.walk(root)) {
			paths.filter(Files::isRegularFile)
					.filter(path -> path.toString().endsWith(".xlsx"))
					.forEach(toProcess::add);
		} catch (IOException e) {
			System.err.println("Could not scan " + root + " for xlsx files");
			e.printStackTrace();
		}
	}

	/**
	 * Reads all collected workbooks and passes every data row to the given
	 * manager.
	 *
	 * @param dbManager
	 *            receiver of the references that are read
	 */
	public void readData(DBManager dbManager) {
		this.writer = dbManager;

		for (Path path : toProcess) {
			System.out.println("Reading from path " + path);
			readData(path);
		}
	}

	/**
	 * Opens one workbook and processes all of its sheets. An
	 * {@link IOException} is reported on standard error; the remaining files
	 * are still processed by the caller.
	 *
	 * @param file
	 *            the xlsx file to read
	 */
	private void readData(Path file) {
		// The stream is a resource of its own so that it is closed even when
		// the workbook constructor throws.
		try (InputStream in = Files.newInputStream(file); XSSFWorkbook wb = new XSSFWorkbook(in)) {
			for (int i = 0; i < wb.getNumberOfSheets(); i++) {
				processSheet(wb.getSheetAt(i));
			}
		} catch (IOException e) {
			System.err.println("Could not read workbook " + file);
			e.printStackTrace();
		}
	}

	/**
	 * Reads all rows after the header row of a sheet and writes one reference
	 * per row that names a paper.
	 * <p>
	 * A blank variable cell keeps the variable of the closest preceding row
	 * that had one. Rows that do not exist and rows without a paper are
	 * skipped.
	 *
	 * @param sheet
	 *            the sheet to read; its name is used as the dataset ID
	 */
	private void processSheet(XSSFSheet sheet) {
		String datasetID = sheet.getSheetName();
		System.out.println("Processing sheet: " + datasetID);

		setLabels(sheet.getRow(0));

		String varRef = null;

		// getLastRowNum() is the zero-based index of the last row, so rows
		// behind a gap are reached as well; missing rows come back as null.
		int lastRow = sheet.getLastRowNum();
		for (int i = 1; i <= lastRow; i++) {
			Row row = sheet.getRow(i);
			if (row == null) {
				continue;
			}

			Cell varCell = row.getCell(variableColumn, Row.CREATE_NULL_AS_BLANK);
			Cell paperCell = row.getCell(paperColumn, Row.RETURN_BLANK_AS_NULL);
			Cell refCell = row.getCell(referenceColumn, Row.RETURN_BLANK_AS_NULL);

			if (varCell.getCellType() != Cell.CELL_TYPE_BLANK) {
				varRef = varCell.getStringCellValue();
			}

			String refText = null == refCell ? null : refCell.getStringCellValue();

			if (paperCell == null) {
				// Without a paper there is nothing to attach the reference to.
				// Only report the row when reference text would be lost.
				if (refText != null) {
					System.err.println("Skipping row " + (i + 1) + " of sheet " + datasetID
							+ ": reference text without a paper");
				}
				continue;
			}

			writer.writeReference(varRef, paperCell.getStringCellValue(), datasetID, refText);
		}
	}

	/**
	 * Determines the column layout of a sheet from its header row.
	 * <p>
	 * The layout is first reset to the default columns so that a label missing
	 * from this header does not inherit the column of a previously read sheet.
	 * Only string cells are considered as labels.
	 *
	 * @param row
	 *            the header row, or {@code null} if the sheet has none
	 */
	private void setLabels(Row row) {
		variableColumn = DEFAULT_VARIABLE_COLUMN;
		paperColumn = DEFAULT_PAPER_COLUMN;
		referenceColumn = DEFAULT_REFERENCE_COLUMN;

		if (row == null) {
			return;
		}

		// getLastCellNum() is the index of the last cell plus one (-1 for a
		// row without cells); cells inside a gap come back as null.
		int cellCount = row.getLastCellNum();
		for (int i = 0; i < cellCount; i++) {
			Cell cell = row.getCell(i);
			if (cell == null || cell.getCellType() != Cell.CELL_TYPE_STRING) {
				continue;
			}
			switch (cell.getStringCellValue()) {
			case "Variable":
				variableColumn = i;
				break;
			case "Paper":
				paperColumn = i;
				break;
			case "Reference":
				referenceColumn = i;
				break;
			default:
				break;
			}
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy