// org.jpedal.grouping.PdfGroupingAlgorithms Maven / Gradle / Ivy
// Show all versions of OpenViewerFX Show documentation
/*
* ===========================================
* Java Pdf Extraction Decoding Access Library
* ===========================================
*
* Project Info: http://www.idrsolutions.com
* Help section for developers at http://www.idrsolutions.com/support/
*
* (C) Copyright 1997-2017 IDRsolutions and Contributors.
*
* This file is part of JPedal/JPDF2HTML5
*
@LICENSE@
*
* ---------------
* PdfGroupingAlgorithms.java
* ---------------
*/
package org.jpedal.grouping;
import java.awt.Rectangle;
import java.util.*;
import org.jpedal.exception.PdfException;
import org.jpedal.objects.PdfData;
import org.jpedal.utils.repositories.*;
import org.jpedal.utils.repositories.generic.Vector_Rectangle_Int;
/**
* Applies heuristics to unstructured PDF text to create content
*/
public class PdfGroupingAlgorithms {
//marker char used in content (we bury location for each char so we can split)
//taken from PdfData.marker; removeHiddenMarkers uses it as the delimiter when stripping the buried data
private static final String MARKER = PdfData.marker;
//first character of MARKER, exposed publicly as a char
public static final char MARKER2 = MARKER.charAt(0);
//NOTE(review): not read in the visible part of this class - presumably switches processing to unrotated page coordinates; confirm against the users of this flag
public static boolean useUnrotatedCoords;
//Value placed between result areas to show they are part of the same result
private static final int linkedSearchAreas = -101;
//delegate for searching; teaser settings are forwarded to it
private final PdfSearchUtils searcher;
//delegate for the table, word list and rectangle text extraction calls
private final PdfTextExtractionUtils extracter;
/**
 * Builds the grouping helper around the raw page data.
 * The same PdfData is handed to both the search delegate and the extraction delegate.
 *
 * @param pdf_data PdfData from the pdf to search and extract from
 * @param isXMLExtraction forwarded to the extraction delegate; true if output should be xml
 */
public PdfGroupingAlgorithms(final PdfData pdf_data, final boolean isXMLExtraction) {
    this.searcher = new PdfSearchUtils(pdf_data);
    this.extracter = new PdfTextExtractionUtils(pdf_data, isXMLExtraction);
}
/**
 * Chooses whether teasers produced by searches include HTML.
 * The setting is simply forwarded to the search delegate.
 *
 * @param value true to use HTML in teasers, otherwise false
 */
public void setIncludeHTML(final boolean value) {
    this.searcher.setIncludeHTML(value);
}
/**
 * Strips the hidden per-character encoding from extracted text so that only
 * the text itself remains.
 *
 * <p>Each encoded fragment is laid out as
 * {@code MARKER start MARKER width MARKER text}: the start and width fields
 * (and the markers around them) are discarded and the text is kept. Anything
 * outside a marker sequence is copied unchanged. A marker sequence that is cut
 * short at the end of the input is dropped rather than raising a
 * {@link java.util.NoSuchElementException}.
 *
 * @param contents text which may contain hidden markers (may be null)
 * @return the text without marker data; the same String if it holds no
 * markers; null if contents is null
 */
public static String removeHiddenMarkers(final String contents) {

    //trap null
    if (contents == null) {
        return null;
    }

    //nothing to strip if there are no markers
    if (!contents.contains(MARKER)) {
        return contents;
    }

    //delimiters are returned as tokens so the start of each encoded fragment can be seen
    final StringTokenizer tokens = new StringTokenizer(contents, MARKER, true);
    final StringBuilder processed = new StringBuilder(contents.length());

    while (tokens.hasMoreTokens()) {

        final String token = tokens.nextToken();

        //plain text between encoded fragments
        if (!token.equals(MARKER)) {
            processed.append(token);
            continue;
        }

        //skip: point character starts, second marker, width, third marker
        boolean complete = true;
        for (int skipped = 0; skipped < 4; skipped++) {
            if (!tokens.hasMoreTokens()) {
                complete = false; //input ended in the middle of a fragment
                break;
            }
            tokens.nextToken();
        }

        //put back chars (the value that follows the third marker)
        if (complete && tokens.hasMoreTokens()) {
            processed.append(tokens.nextToken());
        }
    }

    return processed.toString();
}
/**
 * Extracts the text inside the given rectangle as a table by delegating
 * to the extraction helper with the arguments unchanged.
 *
 * isCSV selects CSV or XHTML output. The XHTML form additionally has options
 * to include font tags (keepFontInfo), preserve widths (keepWidthInfo),
 * try to preserve alignment (keepAlignmentInfo) and set a table border
 * width (borderWidth).
 *
 * @param x1 is the x coord of the top left corner
 * @param y1 is the y coord of the top left corner
 * @param x2 is the x coord of the bottom right corner
 * @param y2 is the y coord of the bottom right corner
 * @param pageNumber is the page you wish to extract from
 * @param isCSV if true the text is output as CSV, if false as xhtml
 * @param keepFontInfo if true and isCSV is false keeps font information in extracted text
 * @param keepWidthInfo if true and isCSV is false keeps width information in extracted text
 * @param keepAlignmentInfo if true and isCSV is false keeps alignment information in extracted text
 * @param borderWidth is the width of the border for xhtml
 * @return Map containing text found in estimated table cells
 * @throws PdfException If the co-ordinates are not valid
 */
@SuppressWarnings("UnusedParameters")
public final Map extractTextAsTable(
        final int x1, final int y1, final int x2, final int y2,
        final int pageNumber,
        final boolean isCSV,
        final boolean keepFontInfo,
        final boolean keepWidthInfo,
        final boolean keepAlignmentInfo,
        final int borderWidth) throws PdfException {

    final Map tableContent = extracter.extractTextAsTable(
            x1, y1, x2, y2,
            pageNumber,
            isCSV,
            keepFontInfo, keepWidthInfo, keepAlignmentInfo,
            borderWidth);

    return tableContent;
}
/**
 * Extracts the words inside the given rectangle, together with their
 * coordinates, by delegating to the extraction helper with the arguments unchanged.
 *
 * @param x1 is the x coord of the top left corner
 * @param y1 is the y coord of the top left corner
 * @param x2 is the x coord of the bottom right corner
 * @param y2 is the y coord of the bottom right corner
 * @param page_number is the page you wish to extract from
 * @param breakFragments will divide up text based on white space characters
 * @param punctuation is a string containing all values that should be used to divide up words
 * @return List containing words found and words coordinates (word, x1,y1,x2,y2...)
 * @throws PdfException If the co-ordinates are not valid
 */
@SuppressWarnings("UnusedParameters")
public final List extractTextAsWordlist(
        final int x1, final int y1, final int x2, final int y2,
        final int page_number,
        final boolean breakFragments,
        final String punctuation) throws PdfException {

    final List wordsWithCoords = extracter.extractTextAsWordlist(
            x1, y1, x2, y2,
            page_number,
            breakFragments,
            punctuation);

    return wordsWithCoords;
}
/**
 * Extracts the text found inside the given rectangle on a page as a single
 * String, by delegating to the extraction helper with the arguments unchanged.
 *
 * @param x1 is the x coord of the top left corner
 * @param y1 is the y coord of the top left corner
 * @param x2 is the x coord of the bottom right corner
 * @param y2 is the y coord of the bottom right corner
 * @param page_number is the page you wish to extract from
 * @param estimateParagraphs will attempt to find paragraphs and add new lines in output if true
 * @param breakFragments will divide up text based on white space characters if true
 * @return String holding the text extracted from the rectangle
 * @throws PdfException If the co-ordinates are not valid
 */
@SuppressWarnings("UnusedParameters")
public final String extractTextInRectangle(
        final int x1, final int y1, final int x2, final int y2,
        final int page_number,
        final boolean estimateParagraphs,
        final boolean breakFragments) throws PdfException {

    final String extractedText = extracter.extractTextInRectangle(
            x1, y1, x2, y2,
            page_number,
            estimateParagraphs,
            breakFragments);

    return extractedText;
}
//
/**
* Algorithm to find multiple text terms in x1,y1,x2,y2 rectangle on page_number, with matching teaser.
* The teaser is a section of text that starts before the result and ends after it. Should a teaser not be
* discovered, it will instead be set to the text of the search result.
*
* @param x1 the left x coordinate
* @param y1 the upper y coordinate
* @param x2 the right x coordinate
* @param y2 the lower y coordinate
* @param rotation the rotation of the page to be searched
* @param terms the terms to search for
* @param searchType the search type, made up from one or more constants obtained from the SearchType class
* @param listener an implementation of SearchListener is required, this is to enable searching to be cancelled
* @return a SortedMap containing a collection of Rectangle describing the location of found text, mapped to a String which is the matching teaser
* @throws PdfException If the co-ordinates are not valid
*/
public SortedMap findMultipleTermsInRectangleWithMatchingTeasers(final int x1, final int y1, final int x2, final int y2, final int rotation,
final String[] terms, final int searchType, final SearchListener listener) throws PdfException {
searcher.clearStoredTeasers();
final boolean origIncludeTease = searcher.isGeneratingTeasers();
searcher.generateTeasers(true);
final List highlights = findMultipleTermsInRectangle(x1, y1, x2, y2, terms, searchType, listener);
final SortedMap