org.jpedal.grouping.PdfGroupingAlgorithms Maven / Gradle / Ivy

/*
 * ===========================================
 * Java Pdf Extraction Decoding Access Library
 * ===========================================
 *
 * Project Info:  http://www.idrsolutions.com
 * Help section for developers at http://www.idrsolutions.com/support/
 *
 * (C) Copyright 1997-2017 IDRsolutions and Contributors.
 *
 * This file is part of JPedal/JPDF2HTML5
 *
 @LICENSE@
 *
 * ---------------
 * PdfGroupingAlgorithms.java
 * ---------------
 */
package org.jpedal.grouping;

import java.awt.Rectangle;
import java.util.*;

import org.jpedal.exception.PdfException;
import org.jpedal.objects.PdfData;
import org.jpedal.utils.repositories.*;
import org.jpedal.utils.repositories.generic.Vector_Rectangle_Int;

/**
 * Applies heuristics to unstructured PDF text to create content
 */
public class PdfGroupingAlgorithms {

    //marker char used in content (we bury location for each char so we can split)
    //the marker string itself is supplied by PdfData.marker
    private static final String MARKER = PdfData.marker;
    //first character of MARKER, exposed as a char
    public static final char MARKER2 = MARKER.charAt(0);

    //NOTE(review): public, mutable, static flag - it is not read anywhere in the visible
    //part of this class; presumably it selects unrotated page coordinates elsewhere - confirm
    public static boolean useUnrotatedCoords;

    //Value placed between result areas to show they are part of the same result
    private static final int linkedSearchAreas = -101;

    //helpers created in the constructor from the supplied PdfData:
    //searcher receives the search/teaser calls, extracter receives the text extraction calls
    private final PdfSearchUtils searcher;
    private final PdfTextExtractionUtils extracter;

    /**
     * Builds the grouping helper around the raw data of a page. The search
     * helper and the text extraction helper are both created from the same PdfData.
     *
     * @param pdf_data        PdfData from the pdf to search
     * @param isXMLExtraction Boolean flag to specify if output should be xml
     */
    public PdfGroupingAlgorithms(final PdfData pdf_data, final boolean isXMLExtraction) {

        //search helper only needs the raw data
        this.searcher = new PdfSearchUtils(pdf_data);

        //extraction helper also needs to know the output flavour
        this.extracter = new PdfTextExtractionUtils(pdf_data, isXMLExtraction);
    }

    /**
     * Controls whether teasers produced by searches include HTML.
     * The flag is handed straight to the search helper.
     *
     * @param value true to use HTML in teasers, otherwise false
     */
    public void setIncludeHTML(final boolean value) {
        this.searcher.setIncludeHTML(value);
    }

    /**
     * Returns the text with the embedded location markers removed.
     * <p>
     * Each marked fragment is read as: marker, start point, marker, width,
     * marker, value. Only the value is kept; text that is not part of a marker
     * sequence is copied unchanged.
     * <p>
     * If the input ends part-way through a marker sequence the incomplete
     * sequence is dropped and the text collected so far is returned, rather
     * than failing with a NoSuchElementException.
     *
     * @param contents text which may contain markers (may be null)
     * @return the text without markers, the same String if it holds no markers,
     * or null if contents is null
     */
    public static String removeHiddenMarkers(final String contents) {

        //trap null
        if (contents == null) {
            return null;
        }

        //make sure has markers and ignore if not
        if (!contents.contains(MARKER)) {
            return contents;
        }

        //delimiters are returned as tokens so the start of each sequence can be seen
        final StringTokenizer tokens = new StringTokenizer(contents, MARKER, true);
        final StringBuilder processedData = new StringBuilder(contents.length());

        while (tokens.hasMoreTokens()) {

            final String token = tokens.nextToken();

            if (token.equals(MARKER)) {

                //skip: point character starts, second marker, width, third marker
                int toSkip = 4;
                while (toSkip > 0 && tokens.hasMoreTokens()) {
                    tokens.nextToken();
                    toSkip--;
                }

                //put back chars (absent only when the sequence was cut short)
                if (tokens.hasMoreTokens()) {
                    processedData.append(tokens.nextToken());
                }
            } else {
                //plain text outside any marker sequence
                processedData.append(token);
            }
        }

        return processedData.toString();
    }

    /**
     * Extracts the text inside the given rectangle as a table by handing the
     * request to the text extraction helper.
     * <p>
     * isCSV selects the output format: CSV when true, XHTML when false.
     * For XHTML output there are further options to include font tags
     * (keepFontInfo), preserve widths (keepWidthInfo), try to preserve
     * alignment (keepAlignmentInfo) and set a table border width (borderWidth).
     *
     * @param x1                is the x coord of the top left corner
     * @param y1                is the y coord of the top left corner
     * @param x2                is the x coord of the bottom right corner
     * @param y2                is the y coord of the bottom right corner
     * @param pageNumber        is the page you wish to extract from
     * @param isCSV             if false the output is xhtml, if true the text is output as CSV
     * @param keepFontInfo      if true and isCSV is false keeps font information in extracted text
     * @param keepWidthInfo     if true and isCSV is false keeps width information in extracted text
     * @param keepAlignmentInfo if true and isCSV is false keeps alignment information in extracted text
     * @param borderWidth       is the width of the border for xhtml
     * @return Map containing text found in estimated table cells
     * @throws PdfException If the co-ordinates are not valid
     */
    @SuppressWarnings("UnusedParameters")
    public final Map extractTextAsTable(
            final int x1,
            final int y1,
            final int x2,
            final int y2,
            final int pageNumber,
            final boolean isCSV,
            final boolean keepFontInfo,
            final boolean keepWidthInfo,
            final boolean keepAlignmentInfo,
            final int borderWidth)
            throws PdfException {

        final Map tableContent = extracter.extractTextAsTable(
                x1, y1, x2, y2,
                pageNumber,
                isCSV,
                keepFontInfo, keepWidthInfo, keepAlignmentInfo,
                borderWidth);

        return tableContent;
    }

    /**
     * Collects the words found inside the given rectangle, together with each
     * word's coordinates, by handing the request to the text extraction helper.
     *
     * @param x1             is the x coord of the top left corner
     * @param y1             is the y coord of the top left corner
     * @param x2             is the x coord of the bottom right corner
     * @param y2             is the y coord of the bottom right corner
     * @param page_number    is the page you wish to extract from
     * @param breakFragments will divide up text based on white space characters
     * @param punctuation    is a string containing all values that should be used to divide up words
     * @return List containing words found and words coordinates (word, x1,y1,x2,y2...)
     * @throws PdfException If the co-ordinates are not valid
     */
    @SuppressWarnings("UnusedParameters")
    public final List extractTextAsWordlist(
            final int x1,
            final int y1,
            final int x2,
            final int y2,
            final int page_number,
            final boolean breakFragments,
            final String punctuation)
            throws PdfException {

        final List wordsWithCoords = extracter.extractTextAsWordlist(
                x1, y1, x2, y2,
                page_number,
                breakFragments,
                punctuation);

        return wordsWithCoords;
    }

    /**
     * Returns the text found inside the given rectangle on a page as a single
     * String, by handing the request to the text extraction helper.
     *
     * @param x1                 is the x coord of the top left corner
     * @param y1                 is the y coord of the top left corner
     * @param x2                 is the x coord of the bottom right corner
     * @param y2                 is the y coord of the bottom right corner
     * @param page_number        is the page you wish to extract from
     * @param estimateParagraphs will attempt to find paragraphs and add new lines in output if true
     * @param breakFragments     will divide up text based on white space characters if true
     * @return String holding the text extracted from the rectangle
     * @throws PdfException If the co-ordinates are not valid
     */
    @SuppressWarnings("UnusedParameters")
    public final String extractTextInRectangle(
            final int x1,
            final int y1,
            final int x2,
            final int y2,
            final int page_number,
            final boolean estimateParagraphs,
            final boolean breakFragments)
            throws PdfException {

        final String extractedText = extracter.extractTextInRectangle(
                x1, y1, x2, y2,
                page_number,
                estimateParagraphs,
                breakFragments);

        return extractedText;
    }

    //

    /**
     * Algorithm to find multiple text terms in the x1,y1,x2,y2 rectangle, with matching teaser.
     * The teaser is a section of text that starts before the result and ends after it; should a teaser
     * not be discovered it will instead be set to the search result's text.
     * <p>
     * Teaser generation is switched on for the duration of the search and the previous setting is
     * always restored afterwards, including when the search throws.
     *
     * @param x1         the left x cord
     * @param y1         the upper y cord
     * @param x2         the right x cord
     * @param y2         the lower y cord
     * @param rotation   the rotation of the page to be searched (passed to the comparator that orders the results)
     * @param terms      the terms to search for
     * @param searchType the search type made up from one or more constants obtained from the SearchType class
     * @param listener   an implementation of SearchListener is required, this is to enable searching to be cancelled
     * @return a SortedMap containing a collection of Rectangle describing the location of found text, mapped to a String which is the matching teaser
     * @throws PdfException If the co-ordinates are not valid
     */
    public SortedMap findMultipleTermsInRectangleWithMatchingTeasers(final int x1, final int y1, final int x2, final int y2, final int rotation,
                                                                     final String[] terms, final int searchType, final SearchListener listener) throws PdfException {
        searcher.clearStoredTeasers();

        //remember the current setting so it can be put back once we are done
        final boolean origIncludeTease = searcher.isGeneratingTeasers();
        searcher.generateTeasers(true);

        try {
            final List highlights = findMultipleTermsInRectangle(x1, y1, x2, y2, terms, searchType, listener);

            final SortedMap highlightsWithTeasers = new TreeMap(new PdfTextExtractionUtils.ResultsComparatorRectangle(rotation));

            //teasers are matched to highlights by index
            final String[] teasers = searcher.getTeasers();
            for (int i = 0; i < highlights.size(); i++) {
                //highlights.get(i) is a rectangle or a rectangle[]
                highlightsWithTeasers.put(highlights.get(i), teasers[i]);
            }

            return highlightsWithTeasers;
        } finally {
            //restore the setting even if the search failed, so a failure does not
            //leave teaser generation switched on for later searches
            searcher.generateTeasers(origIncludeTease);
        }
    }

    /**
     * Method to search a specified area for one or more search terms.
     * The returned map contains a set of coordinates for found values and a teaser.
     * The teaser is a section of text that starts before the result and ends after it;
     * should a teaser not be discovered it will instead be set to the search result's text.
     * <p>
     * Teaser generation is switched on for the duration of the search and the previous setting is
     * always restored afterwards, including when the search throws.
     *
     * @param x1         the left x cord
     * @param y1         the upper y cord
     * @param x2         the right x cord
     * @param y2         the lower y cord
     * @param rotation   the rotation of the page to be searched (passed to the comparator that orders the results)
     * @param terms      the terms to search for
     * @param searchType the search type made up from one or more constants obtained from the SearchType class
     * @param listener   an implementation of SearchListener is required, this is to enable searching to be cancelled
     * @return a SortedMap containing an int[] of coordinates as the key and a String teaser as the value
     * @throws PdfException If the co-ordinates are not valid
     */
    public SortedMap findTextWithinInAreaWithTeasers(final int x1, final int y1, final int x2, final int y2, final int rotation,
                                                     final String[] terms, final int searchType, final SearchListener listener) throws PdfException {

        searcher.clearStoredTeasers();

        //remember the current setting so it can be put back once we are done
        final boolean origIncludeTease = searcher.isGeneratingTeasers();
        searcher.generateTeasers(true);

        try {
            final List highlights = findTextWithinArea(x1, y1, x2, y2, terms, searchType, listener);

            final SortedMap highlightsWithTeasers = new TreeMap(new PdfTextExtractionUtils.ResultsComparator(rotation));

            //teasers are matched to highlights by index
            final String[] teasers = searcher.getTeasers();
            for (int i = 0; i < highlights.size(); i++) {
                //NOTE(review): key is whatever findTextWithinArea returns per result -
                //documented above as int[] coordinates - confirm against that method
                highlightsWithTeasers.put(highlights.get(i), teasers[i]);
            }

            return highlightsWithTeasers;
        } finally {
            //restore the setting even if the search failed, so a failure does not
            //leave teaser generation switched on for later searches
            searcher.generateTeasers(origIncludeTease);
        }
    }

    //

    /**
     * Algorithm to find multiple text terms in x1,y1,x2,y2 rectangle on page_number.
     *
     * @param x1           the left x cord
     * @param y1           the upper y cord
     * @param x2           the right x cord
     * @param y2           the lower y cord
     * @param rotation     the rotation of the page to be searched
     * @param terms        the terms to search for
     * @param orderResults if true the list that is returned is ordered to return the resulting rectangles in a
     *                     logical order descending down the page, if false, rectangles for multiple terms are grouped together.
     * @param searchType   searchType the search type made up from one or more constants obtained from the SearchType class
     * @param listener     an implementation of SearchListener is required, this is to enable searching to be cancelled
     * @return a list of Rectangle describing the location of found text
     * @throws PdfException If the co-ordinates are not valid
     */
    public List findMultipleTermsInRectangle(final int x1, final int y1, final int x2, final int y2, final int rotation,
                                             final String[] terms, final boolean orderResults, final int searchType, final SearchListener listener) throws PdfException {

        searcher.clearStoredTeasers();

        final List