// de.citec.scie.pdf.DocumentBlockCleaner
/*
* SCIE -- Spinal Cord Injury Information Extraction
* Copyright (C) 2013, 2014
* Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package de.citec.scie.pdf;
import de.citec.scie.pdf.structure.Document;
import de.citec.scie.pdf.structure.Page;
import de.citec.scie.pdf.structure.TextBlock;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
/**
*
* @author Benjamin Paassen - bpaassen(at)techfak.uni-bielefeld.de
*/
public class DocumentBlockCleaner {

    /** Text blocks shorter than this many characters are candidates for cleanup. */
    public static final int SMALLBLOCKSIZE = 200;
    /** Minimum match confidence required to treat a block sequence as redundant. */
    public static final double REMOVETHRESHOLD = 0.7;

    public DocumentBlockCleaner() {
    }

    /**
     * Removes redundant short text blocks (e.g. repeated headers/footers) from
     * the given document, and drops pages that become empty as a result.
     *
     * The cleanup is done using a greedy heuristic as follows: Start with short
     * text blocks on the first page and then iterate over all other pages and
     * try to build a sequence of most similar TextBlocks to it. If we find a
     * similar TextBlock on all (or at least most) pages, the information is
     * redundant and we can exclude those TextBlocks from the document.
     *
     * @param doc a document; its {@code content} pages are modified in place.
     */
    public void blockCleanup(Document doc) {
        // Cache of TextBlock -> String conversions so each block is rendered once.
        final HashMap<TextBlock, String> convertedBlocks = new HashMap<>();
        final ArrayList<Page> pagesToRemove = new ArrayList<>();
        for (int pageIdx = 0; pageIdx < doc.content.size(); pageIdx++) {
            // Store the blocks that we want to remove on the current page. Otherwise
            // we would mutate the list we are currently iterating over.
            final ArrayList<TextBlock> blocksToRemove = new ArrayList<>();
            final List<TextBlock> currentBlocks = doc.content.get(pageIdx).content;
            for (final TextBlock block : currentBlocks) {
                final String blockString = block.toString();
                if (blockString.length() < SMALLBLOCKSIZE) {
                    // We found a fitting (short) block to start with, so look for
                    // similar blocks on all following pages.
                    final FittingBlock[] fittingBlocks
                            = findBestMatches(doc, blockString, convertedBlocks, pageIdx);
                    // Optimize that array and remove outliers from it.
                    optimize(fittingBlocks);
                    // The solution is acceptable if the maximum confidence exceeds
                    // REMOVETHRESHOLD.
                    double maxConfidence = 0;
                    for (final FittingBlock fitting : fittingBlocks) {
                        if (fitting != null && fitting.getMatchConfidence() > maxConfidence) {
                            maxConfidence = fitting.getMatchConfidence();
                        }
                    }
                    if (maxConfidence > REMOVETHRESHOLD) {
                        // Remove the matched blocks on the later pages ...
                        for (int i = 0; i < fittingBlocks.length; i++) {
                            if (fittingBlocks[i] != null && fittingBlocks[i].getMatch() != null) {
                                doc.content.get(pageIdx + 1 + i).content.remove(
                                        fittingBlocks[i].getMatch());
                            }
                        }
                        // ... and schedule the starting block itself for removal.
                        blocksToRemove.add(block);
                    }
                }
            }
            for (final TextBlock blockToRemove : blocksToRemove) {
                currentBlocks.remove(blockToRemove);
            }
            if (currentBlocks.isEmpty()) {
                pagesToRemove.add(doc.content.get(pageIdx));
            }
        }
        for (final Page page : pagesToRemove) {
            doc.content.remove(page);
        }
    }

    /**
     * For each page after {@code startPageIdx}, finds the short block most
     * similar to {@code blockString}.
     *
     * @param blockString the rendered text of the reference block.
     * @param convertedBlocks cache of TextBlock -> String conversions, shared
     * across calls and updated here.
     * @param startPageIdx index of the page the reference block lives on.
     * @return one entry per following page; entries with no short block on the
     * page carry confidence 0 and a {@code null} match.
     */
    private FittingBlock[] findBestMatches(Document doc, String blockString,
            HashMap<TextBlock, String> convertedBlocks, int startPageIdx) {
        final StringSimilarity simAlgo = new StringSimilarity();
        final FittingBlock[] fittingBlocks = new FittingBlock[doc.content.size()
                - startPageIdx - 1];
        // Iterate through all following pages and try to find similar blocks.
        for (int otherPageIdx = startPageIdx + 1; otherPageIdx < doc.content.size();
                otherPageIdx++) {
            // For every page find the best fitting block.
            double maxConfidence = 0;
            TextBlock maxBlock = null;
            for (final TextBlock otherBlock : doc.content.get(otherPageIdx).content) {
                String otherString = convertedBlocks.get(otherBlock);
                if (otherString == null) {
                    otherString = otherBlock.toString();
                    convertedBlocks.put(otherBlock, otherString);
                }
                // Only short blocks qualify as header/footer candidates.
                if (otherString.length() < SMALLBLOCKSIZE) {
                    final double confidence = simAlgo.calculate(blockString, otherString);
                    if (confidence > maxConfidence) {
                        maxConfidence = confidence;
                        maxBlock = otherBlock;
                    }
                }
            }
            fittingBlocks[otherPageIdx - startPageIdx - 1] = new FittingBlock(maxConfidence,
                    maxBlock);
        }
        return fittingBlocks;
    }

    /**
     * Prunes outliers from the match sequence in place: entries whose removal
     * improves the criterion {@code pages * product-of-confidences} are set to
     * {@code null}, lowest confidence first, stopping at the first
     * non-improving removal.
     */
    private void optimize(FittingBlock[] fittingBlocks) {
        // Initialize confidence and number of pages.
        double currentConfidence = 1;
        for (final FittingBlock block : fittingBlocks) {
            currentConfidence *= block.getMatchConfidence();
        }
        int currentPages = fittingBlocks.length;
        double optimum = currentPages * currentConfidence;
        // Initialize a queue containing the blocks ordered by their confidence
        // (from lowest to highest).
        final List<FittingBlock> blockList = new ArrayList<>(Arrays.asList(fittingBlocks));
        Collections.sort(blockList);
        final ArrayDeque<FittingBlock> blockQueue = new ArrayDeque<>(blockList);
        /*
         * Iteratively remove the block with the lowest confidence from the
         * queue and check if that improves our criterion: pages * confidence.
         * As soon as it does not improve anymore, stop the optimization.
         */
        while (!blockQueue.isEmpty()) {
            final FittingBlock lowestConfidenceBlock = blockQueue.poll();
            currentPages--;
            // Recompute the product over the remaining queue instead of dividing
            // by the removed block's confidence: division breaks (0/0 == NaN) as
            // soon as a page had no match at all (confidence 0), which silently
            // aborted the optimization and left zero-confidence outliers in place.
            currentConfidence = 1;
            for (final FittingBlock block : blockQueue) {
                currentConfidence *= block.getMatchConfidence();
            }
            final double current = currentConfidence * currentPages;
            if (current > optimum) {
                optimum = current;
                // Null out the pruned entry in the original array (identity match,
                // first occurrence only).
                for (int i = 0; i < fittingBlocks.length; i++) {
                    if (lowestConfidenceBlock == fittingBlocks[i]) {
                        fittingBlocks[i] = null;
                        break;
                    }
                }
            } else {
                break;
            }
        }
    }

    /**
     * Value holder pairing a candidate TextBlock with the similarity confidence
     * of its match; ordered by ascending confidence.
     */
    private static class FittingBlock implements Comparable<FittingBlock> {

        private final double matchConfidence;
        private final TextBlock match;

        public FittingBlock(double matchConfidence, TextBlock match) {
            this.matchConfidence = matchConfidence;
            this.match = match;
        }

        public TextBlock getMatch() {
            return match;
        }

        public double getMatchConfidence() {
            return matchConfidence;
        }

        @Override
        public int compareTo(FittingBlock o) {
            return Double.compare(matchConfidence, o.matchConfidence);
        }
    }
}