All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.citec.scie.pdf.DocumentBlockCleaner Maven / Gradle / Ivy

/*
 * SCIE -- Spinal Cord Injury Information Extraction
 * Copyright (C) 2013, 2014
 * Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

package de.citec.scie.pdf;

import de.citec.scie.pdf.structure.Document;
import de.citec.scie.pdf.structure.Page;
import de.citec.scie.pdf.structure.TextBlock;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;

/**
 *
 * @author Benjamin Paassen - bpaassen(at)techfak.uni-bielefeld.de
 */
/**
 * Removes redundant short text blocks (page headers, footers, page numbers,
 * running titles) from a parsed PDF {@link Document}. A block is considered
 * redundant if a sufficiently similar short block occurs on all (or most)
 * of the following pages.
 *
 * <p>NOTE(review): the generic type parameters in this file had been stripped
 * (raw {@code HashMap}, {@code List}, {@code Comparable}), which does not
 * compile; they are restored here from the surrounding usage.</p>
 *
 * @author Benjamin Paassen - bpaassen(at)techfak.uni-bielefeld.de
 */
public class DocumentBlockCleaner {

	/** Text blocks shorter than this (in characters) are removal candidates. */
	public static final int SMALLBLOCKSIZE = 200;
	/** Minimum best-match confidence a block sequence must reach to be removed. */
	public static final double REMOVETHRESHOLD = 0.7;

	public DocumentBlockCleaner() {
	}

	/**
	 * The cleanup is done using a greedy heuristic as follows: Start with short
	 * text blocks on the first page and then iterate over all other pages and
	 * try to build a sequence of most similar TextBlocks to it. If we find a
	 * similar TextBlock on all (or at least most) pages, the information is
	 * redundant and we can exclude those TextBlocks from the document.
	 *
	 * @param doc a document; its pages and text blocks are modified in place.
	 *            Pages that become empty are removed entirely.
	 */
	public void blockCleanup(Document doc) {
		//cache of TextBlock -> string rendering, so each block is converted at most once.
		final HashMap<TextBlock, String> convertedBlocks = new HashMap<>();
		final ArrayList<Page> pagesToRemove = new ArrayList<>();
		for (int pageIdx = 0; pageIdx < doc.content.size(); pageIdx++) {
			//store the blocks that we want to remove on the current page. Otherwise we
			//would manipulate a list that we are currently iterating over.
			final ArrayList<TextBlock> blocksToRemove = new ArrayList<>();
			final List<TextBlock> currentBlocks = doc.content.get(pageIdx).content;
			for (final TextBlock block : currentBlocks) {
				final String blockString = block.toString();
				if (blockString.length() < SMALLBLOCKSIZE) {
					//now we have found a fitting block to start with.
					//so iterate through all following pages and find similar blocks.
					final FittingBlock[] fittingBlocks
							= findBestMatches(doc, blockString, convertedBlocks, pageIdx);
					//then optimize that array and remove outliers from it
					optimize(fittingBlocks);
					//check if the solution is acceptable. It is if the maximum confidence
					//is above REMOVETHRESHOLD
					double maxConfidence = 0;
					for (int i = 0; i < fittingBlocks.length; i++) {
						if (fittingBlocks[i] != null && fittingBlocks[i].getMatchConfidence()
								> maxConfidence) {
							maxConfidence = fittingBlocks[i].getMatchConfidence();
						}
					}
					if (maxConfidence > REMOVETHRESHOLD) {
						//remove the matched blocks from the later pages immediately;
						//those pages are not being iterated right now.
						for (int i = 0; i < fittingBlocks.length; i++) {
							if (fittingBlocks[i] != null && fittingBlocks[i].getMatch() != null) {
								//slot i corresponds to page (pageIdx + 1 + i), matching
								//the index scheme used by findBestMatches.
								doc.content.get(pageIdx + 1 + i).content.remove(
										fittingBlocks[i].getMatch());
							}
						}
						blocksToRemove.add(block);
					}
				}
			}
			for (final TextBlock blockToRemove : blocksToRemove) {
				currentBlocks.remove(blockToRemove);
			}
			if (currentBlocks.isEmpty()) {
				pagesToRemove.add(doc.content.get(pageIdx));
			}
		}
		for (final Page page : pagesToRemove) {
			doc.content.remove(page);
		}

	}

	/**
	 * For each page after {@code startPageIdx}, finds the short text block most
	 * similar to {@code blockString}.
	 *
	 * @param doc             the document to search.
	 * @param blockString     the string form of the reference block.
	 * @param convertedBlocks shared TextBlock-to-String cache, filled on demand.
	 * @param startPageIdx    index of the page the reference block lives on.
	 * @return an array with one entry per following page; entry {@code i} refers
	 *         to page {@code startPageIdx + 1 + i}. If a page has no short block
	 *         at all, its entry has confidence 0 and a {@code null} match.
	 */
	private FittingBlock[] findBestMatches(Document doc, String blockString,
			HashMap<TextBlock, String> convertedBlocks, int startPageIdx) {
		final StringSimilarity simAlgo = new StringSimilarity();
		final FittingBlock[] fittingBlocks = new FittingBlock[doc.content.size()
				- startPageIdx - 1];
		//iterate through all pages and try to find similar blocks.
		for (int otherPageIdx = startPageIdx + 1; otherPageIdx < doc.content.size();
				otherPageIdx++) {
			//for every page find the best fitting block.
			double maxConfidence = 0;
			TextBlock maxBlock = null;
			for (final TextBlock otherBlock : doc.content.get(otherPageIdx).content) {
				String otherString = convertedBlocks.get(otherBlock);
				if (otherString == null) {
					otherString = otherBlock.toString();
					convertedBlocks.put(otherBlock, otherString);
				}
				//only short blocks can be redundant headers/footers.
				if (otherString.length() < SMALLBLOCKSIZE) {
					final double confidence = simAlgo.calculate(blockString, otherString);
					if (confidence > maxConfidence) {
						maxConfidence = confidence;
						maxBlock = otherBlock;
					}
				}
			}
			fittingBlocks[otherPageIdx - startPageIdx - 1] = new FittingBlock(maxConfidence,
					maxBlock);
		}
		return fittingBlocks;
	}

	/**
	 * Greedily discards low-confidence outliers from {@code fittingBlocks}
	 * (setting their slots to {@code null}) as long as doing so improves the
	 * criterion (number of remaining pages) * (product of confidences).
	 *
	 * @param fittingBlocks the match candidates; modified in place.
	 */
	private void optimize(FittingBlock[] fittingBlocks) {
		//initialize confidence and number of pages.
		double currentConfidence = 1;
		for (final FittingBlock block : fittingBlocks) {
			currentConfidence *= block.getMatchConfidence();
		}
		int currentPages = fittingBlocks.length;
		double optimum = currentPages * currentConfidence;
		//initialize a queue containing the blocks ordered by their confidence
		//(from lowest to highest)
		final List<FittingBlock> blockList = new ArrayList<>(Arrays.asList(fittingBlocks));
		Collections.sort(blockList);
		final ArrayDeque<FittingBlock> blockQueue = new ArrayDeque<>(blockList);
		/*
		 * iteratively remove the block with the lowest confidence from the
		 * queue and check if that improves our criterion: pages * confidence.
		 * As soon as it does not improve anymore, stop the optimization.
		 */
		while (!blockQueue.isEmpty()) {
			final FittingBlock lowestConfidenceBlock = blockQueue.poll();
			//calculate new optimization criterion value.
			currentConfidence /= lowestConfidenceBlock.getMatchConfidence();
			currentPages--;
			final double current = currentConfidence * currentPages;
			if (current > optimum) {
				optimum = current;
				//null out the discarded block in the original array (identity match,
				//because distinct FittingBlocks may share the same confidence).
				for (int i = 0; i < fittingBlocks.length; i++) {
					if (lowestConfidenceBlock == fittingBlocks[i]) {
						fittingBlocks[i] = null;
						break;
					}
				}
			} else {
				break;
			}
		}
	}

	/**
	 * Pairs a candidate TextBlock with the similarity confidence of its match.
	 * Natural ordering is ascending by confidence.
	 */
	private static class FittingBlock implements Comparable<FittingBlock> {

		private final double matchConfidence;
		private final TextBlock match;

		public FittingBlock(double matchConfidence, TextBlock match) {
			this.matchConfidence = matchConfidence;
			this.match = match;
		}

		public TextBlock getMatch() {
			return match;
		}

		public double getMatchConfidence() {
			return matchConfidence;
		}

		@Override
		public int compareTo(FittingBlock o) {
			return Double.compare(matchConfidence, o.matchConfidence);
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy