org.jpedal.grouping.PdfSearchUtils Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of OpenViewerFX Show documentation
Open Source (LGPL) JavaFX PDF Viewer for NetBeans plugin
There is a newer version: 7.15.25
/*
 * ===========================================
 * Java Pdf Extraction Decoding Access Library
 * ===========================================
 *
 * Project Info:  http://www.idrsolutions.com
 * Help section for developers at http://www.idrsolutions.com/support/
 *
 * (C) Copyright 1997-2017 IDRsolutions and Contributors.
 *
 * This file is part of JPedal/JPDF2HTML5
 *
 @LICENSE@
 *
 * ---------------
 * PdfSearchUtils.java
 * ---------------
 */
package org.jpedal.grouping;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jpedal.exception.PdfException;

import static org.jpedal.grouping.PdfGroupingAlgorithms.removeHiddenMarkers;

import org.jpedal.objects.PdfData;
import org.jpedal.utils.Fonts;
import org.jpedal.utils.LogWriter;
import org.jpedal.utils.Strip;
import org.jpedal.utils.repositories.Vector_Float;
import org.jpedal.utils.repositories.Vector_String;

public class PdfSearchUtils {

    //Controls whether HTML markup is added around the found term in teasers
    private boolean includeHTMLtags;

    //Teasers gathered by storeTeasers() and handed out by getTeasers().
    //Typed as String so that toArray(String[]) yields a String[].
    private final List<String> multipleTermTeasers = new ArrayList<String>();

    //Hold data from pdf so we can create local version
    private final PdfData pdf_data;

    //Text fragments copied from pdf_data for the current search
    private Line[] fragments;
    //Lines searched by buildSearchText() and getResultCoords()
    private Line[] lines;

    //Value placed between result areas to show they are part of the same result
    private static final int MULTIPLE_AREA_RESULT = -101;

    //When true, searches also collect a teaser for every result
    private boolean includeTease;

    /**
     * Creates a search helper working on the supplied page text data.
     *
     * @param pdf_data the text data the searches read their fragments from
     */
    protected PdfSearchUtils(final PdfData pdf_data) {
        this.pdf_data = pdf_data;
    }

    /**
     * Searches a rectangular area of the currently loaded page and returns
     * the areas of every match as a flat array of float values.
     * <p>
     * Each result occupies five consecutive values:
     * <ul>
     * <li>[0] result x1 coord</li>
     * <li>[1] result y1 coord</li>
     * <li>[2] result x2 coord</li>
     * <li>[3] result y2 coord</li>
     * <li>[4] -101 when the next area is the remainder of this result on
     * another line; any other value is ignored</li>
     * </ul>
     * Coords are pdf page coords with the origin in the bottom left hand
     * corner of the unrotated page.
     *
     * @param x1         x coord of the top left corner
     * @param y1         y coord of the top left corner
     * @param x2         x coord of the bottom right corner
     * @param y2         y coord of the bottom right corner
     * @param terms      search terms, each String is treated as a single term
     * @param searchType bit flags for the search (See class SearchType)
     * @return the coords of the found text, empty when terms is null
     * @throws PdfException if the page content being searched contains invalid data that the search can not recover from
     */
    @SuppressWarnings("UnusedParameters")
    protected final float[] findText(
            int x1,
            int y1,
            int x2,
            int y2,
            final String[] terms,
            final int searchType)
            throws PdfException {

        //Without terms there is nothing to look for
        if (terms == null) {
            return new float[0];
        }

        //Holders for the results and their teasers
        final Vector_Float resultCoords = new Vector_Float(0);
        final Vector_String resultTeasers = new Vector_String(0);

        //Corners come back ordered as {x1, y1, x2, y2}, swapped where needed
        final int[] corners = validateCoordinates(x1, y1, x2, y2);
        x1 = corners[0];
        y1 = corners[1];
        x2 = corners[2];
        y2 = corners[3];

        //Pull the text inside the area into the local fragment array
        copyToArraysPartial(x1, y2, x2, y1);

        //Hidden text must not be found
        cleanupShadowsAndDrownedObjects(false);

        //Search each writing mode that has content, in the order supplied by getWritingModeOrder
        final int[] modeCounts = getWritingModeCounts(fragments.clone());
        for (final int mode : getWritingModeOrder(modeCounts)) {
            if (modeCounts[mode] != 0) {
                searchWritingMode(mode, searchType, terms, resultCoords, resultTeasers);
            }
        }

        return resultCoords.get();
    }

    /**
     * Returns the teasers stored by the searches run since the last call to
     * clearStoredTeasers().
     *
     * @return String[] holding the stored teasers in the order they were stored
     */
    protected String[] getTeasers() {
        final String[] teasers = new String[multipleTermTeasers.size()];
        return multipleTermTeasers.toArray(teasers);
    }

    /**
     * Copies every raw text element accepted by isFragmentWithinArea() into
     * the fragments array. The raw data of each copy is rebuilt so that it
     * only holds the characters lying strictly inside the area along the
     * direction the text runs in, and the start and end coords of the
     * fragment are reset from the character positions.
     */
    private void copyToArraysPartial(final int minX, final int minY, final int maxX, final int maxY) {

        final String marker = PdfData.marker;
        final int total = pdf_data.getRawTextElementCount();

        final Line[] kept = new Line[total];
        int keptCount = 0;

        for (int index = 0; index < total; index++) {

            //ignore anything which does not touch the area
            if (!isFragmentWithinArea(pdf_data, index, minX, minY, maxX, maxY)) {
                continue;
            }

            final int mode = pdf_data.f_writingMode[index];
            final boolean horizontal = mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT;

            //limits of the area along the direction the text runs in
            final int lower = horizontal ? minX : minY;
            final int upper = horizontal ? maxX : maxY;

            final Line fragment = new Line(pdf_data, index);
            kept[keptCount] = fragment;

            //split raw data into leading tags, marker separated content and trailing tags
            final String raw = fragment.getRawData();
            final int firstMarker = raw.indexOf(marker);
            final int lastMarker = raw.lastIndexOf(marker);

            final StringBuilder rebuilt = new StringBuilder(raw.substring(0, firstMarker));
            final String contentText = raw.substring(firstMarker, raw.indexOf('<', lastMarker));

            //trailing tags start at the first '<' after the last marker
            final String tail = raw.substring(lastMarker);
            final String endTags = tail.substring(tail.indexOf('<'));

            //content is a repeating sequence of coord, width, character
            final StringTokenizer tokens = new StringTokenizer(contentText, marker);
            boolean firstCharacter = true;
            float width = 0;

            while (tokens.hasMoreTokens()) {

                final float coord = Float.parseFloat(tokens.nextToken());
                width = Float.parseFloat(tokens.nextToken());
                final String character = tokens.nextToken();

                //first character fixes the start, every character moves the end
                if (horizontal) {
                    if (firstCharacter) {
                        fragment.setX1(coord);
                    }
                    fragment.setX2(coord);
                } else {
                    if (firstCharacter) {
                        fragment.setY2(coord);
                    }
                    fragment.setY1(coord);
                }
                firstCharacter = false;

                //only characters completely inside the area stay in the raw data
                if (lower < coord && (coord + width) < upper) {
                    rebuilt.append(marker).append(coord);
                    rebuilt.append(marker).append(width);
                    rebuilt.append(marker).append(character);
                }
            }

            fragment.setRawData(rebuilt.append(endTags).toString());

            //end coord so far is the start of the last character, so add its width
            if (horizontal) {
                fragment.setX2(fragment.getX2() + width);
            } else {
                fragment.setY1(fragment.getY1() + width);
            }

            keptCount++;
        }

        //drop the unused tail of the working array
        fragments = Arrays.copyOf(kept, keptCount);
    }

    /**
     * Tests whether raw text element i overlaps the area along the direction
     * its text runs in, while the band between one quarter and three quarters
     * of its height lies inside the area in the other direction. All
     * comparisons are strict. Elements with any other writing mode than the
     * four handled here are rejected.
     */
    private static boolean isFragmentWithinArea(final PdfData pdf_data, final int i, final int minX, final int minY, final int maxX, final int maxY) {

        //extract values
        final float x1 = pdf_data.f_x1[i];
        final float x2 = pdf_data.f_x2[i];
        final float y1 = pdf_data.f_y1[i];
        final float y2 = pdf_data.f_y2[i];
        final int mode = pdf_data.f_writingMode[i];

        if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {

            final float height = y1 - y2;

            //Area contains the x1 or x2 coord
            final boolean endInsideArea = (minX < x1 && x1 < maxX) || (minX < x2 && x2 < maxX);
            //An edge of the area lies between the x1 and x2 coords
            final boolean areaEdgeInsideText = (x1 < minX && minX < x2) || (x1 < maxX && maxX < x2);
            //Middle band of the text height is inside the area
            final boolean heightFits = minY < y2 + (height / 4) && y2 + (height * 0.75) < maxY;

            return (endInsideArea || areaEdgeInsideText) && heightFits;
        }

        if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) {

            final float height = x2 - x1;

            //Area contains the y1 or y2 coord
            final boolean endInsideArea = (minY < y1 && y1 < maxY) || (minY < y2 && y2 < maxY);
            //An edge of the area lies between the y2 and y1 coords
            final boolean areaEdgeInsideText = (y2 < minY && minY < y1) || (y2 < maxY && maxY < y1);
            //Middle band of the text width is inside the area
            final boolean heightFits = minX < x1 + (height / 4) && x1 + (height * 0.75) < maxX;

            return (endInsideArea || areaEdgeInsideText) && heightFits;
        }

        return false;
    }

    /**
     * Orders the corner coords so that x1 is not greater than x2 and y1 is
     * not less than y2. Values in the wrong order are swapped and the swap
     * is logged; nothing is thrown.
     *
     * @return the ordered values as {x1, y1, x2, y2}
     */
    private static int[] validateCoordinates(int x1, int y1, int x2, int y2) {

        int left = x1;
        int right = x2;
        int top = y1;
        int bottom = y2;

        if (left > right) {
            left = x2;
            right = x1;
            LogWriter.writeLog("x1 > x2, coordinates were swapped to validate");
        }

        if (top < bottom) {
            top = y2;
            bottom = y1;
            LogWriter.writeLog("y1 < y2, coordinates were swapped to validate");
        }

        return new int[]{left, top, right, bottom};
    }

    //

    /**
     * Searches the whole of the currently loaded page and returns the areas
     * of every match as a flat array of float values. Matches are allowed to
     * be split across multiple lines.
     * <p>
     * Each result occupies five consecutive values:
     * <ul>
     * <li>[0] result x1 coord</li>
     * <li>[1] result y1 coord</li>
     * <li>[2] result x2 coord</li>
     * <li>[3] result y2 coord</li>
     * <li>[4] -101 when the next area is the remainder of this result on
     * another line; any other value is ignored</li>
     * </ul>
     * Coords are pdf page coords with the origin in the bottom left hand
     * corner of the unrotated page.
     *
     * @param terms      the text to search for
     * @param searchType info on how to search the pdf
     * @return the coords of the found text, empty when terms is null
     * @throws PdfException if the page content being searched contains invalid data that the search can not recover from
     */
    protected final float[] findText(
            final String[] terms,
            final int searchType)
            throws PdfException {

        //Without terms there is nothing to look for
        if (terms == null) {
            return new float[0];
        }

        //Holders for the results and their teasers
        final Vector_Float resultCoords = new Vector_Float(0);
        final Vector_String resultTeasers = new Vector_String(0);

        //Pull the page text into the local fragment array
        copyToArrays();

        //Hidden text must not be found
        cleanupShadowsAndDrownedObjects(false);

        //Search each writing mode that has content, in the order supplied by getWritingModeOrder
        final int[] modeCounts = getWritingModeCounts(fragments.clone());
        for (final int mode : getWritingModeOrder(modeCounts)) {
            if (modeCounts[mode] != 0) {
                searchWritingMode(mode, searchType, terms, resultCoords, resultTeasers);
            }
        }

        return resultCoords.get();
    }


    /**
     * Searches the lines of one writing mode for every term and appends the
     * areas found to resultCoords. When teasers are enabled they are collected
     * in resultTeasers and then handed to storeTeasers().
     *
     * @param mode          writing mode whose lines are searched
     * @param searchType    bit flags for the search (See class SearchType)
     * @param terms         search terms; null or empty entries are skipped
     * @param resultCoords  receives five values per result area
     * @param resultTeasers working store for the teasers of this writing mode
     * @throws PdfException declared for the line building step (createLinesForSearch)
     */
    private void searchWritingMode(final int mode, final int searchType, final String[] terms, final Vector_Float resultCoords, final Vector_String resultTeasers) throws PdfException {

        //Merge text localFragments into lines as displayed on page
        createLinesForSearch(mode, false, false, true);

        //Bitwise flags for regular expressions engine, options always required
        final int options = loadSearcherOptions(searchType);

        //Flags to control the different search options
        final boolean firstOccuranceOnly = (searchType & SearchType.FIND_FIRST_OCCURANCE_ONLY) == SearchType.FIND_FIRST_OCCURANCE_ONLY;
        final boolean wholeWordsOnly = (searchType & SearchType.WHOLE_WORDS_ONLY) == SearchType.WHOLE_WORDS_ONLY;
        final boolean useRegEx = (searchType & SearchType.USE_REGULAR_EXPRESSIONS) == SearchType.USE_REGULAR_EXPRESSIONS;
        final boolean multiLine = (searchType & SearchType.MUTLI_LINE_RESULTS) == SearchType.MUTLI_LINE_RESULTS;
        boolean foundFirst = false;

        //Check if coords need swapping
        final boolean valuesSwapped = (mode == PdfData.VERTICAL_BOTTOM_TO_TOP || mode == PdfData.VERTICAL_TOP_TO_BOTTOM);

        //Portions of text to perform the search on and find teasers
        final String searchText = buildSearchText(false, mode);
        final String coordsText = buildSearchText(true, mode);

        //Work through the search terms one at a time
        for (int j = 0; j != terms.length; j++) {

            //A missing or empty term has no text to highlight and would
            //otherwise fail in the reordering and coord lookup steps
            if (terms[j] == null || terms[j].isEmpty()) {
                continue;
            }

            String searchValue = alterStringTooDisplayOrder(terms[j]);

            //Set the default separator between words in a search term
            String sep = " ";

            //Multiline needs space or newline to be recognised as word separators.
            //Backslashes are doubled as sep is used as a replaceAll replacement,
            //where a backslash escapes the following character.
            if (multiLine) {
                sep = "[ \\\\n]+";
            }

            //if not using reg ex add reg ex literal flags around the text and word separators
            if (!useRegEx) {
                searchValue = "\\Q" + searchValue + "\\E";
                sep = "\\\\E" + sep + "\\\\Q";
            }

            //If word seperator has changed, replace all spaces with modified seperator
            if (!sep.equals(" ")) {
                searchValue = searchValue.replaceAll(" ", sep);
            }

            //Surround search term with word boundry tags to match whole words
            if (wholeWordsOnly) {
                searchValue = "\\b" + searchValue + "\\b";
            }

            //Create pattern to match search term
            final Pattern searchTerm = Pattern.compile(searchValue, options);

            //Create pattern to match search term with two words before and after
            final Pattern teaserTerm = Pattern.compile("(?:\\S+\\s)?\\S*(?:\\S+\\s)?\\S*" + searchValue + "\\S*(?:\\s\\S+)?\\S*(?:\\s\\S+)?", options);

            //So long as text data is not null
            if (searchText != null) {

                //Create two matchers for finding search term and teaser
                final Matcher termFinder = searchTerm.matcher(searchText);
                final Matcher teaserFinder = teaserTerm.matcher(searchText);

                //Keep looping till no result is returned
                while (termFinder.find()) {

                    //Make note of the text found and index in the text
                    String foundTerm = termFinder.group();
                    final int termStarts = termFinder.start();
                    final int termEnds = termFinder.end() - 1;

                    //A zero length match (possible with regular expressions)
                    //covers no character, so there is no area to report
                    if (termEnds < termStarts) {
                        continue;
                    }

                    //If storing teasers
                    if (includeTease) {

                        //Default teaser is the term itself, in bold when tags are allowed
                        if (includeHTMLtags) {
                            foundTerm = "<b>" + foundTerm + "</b>";
                        }

                        findTeaser(foundTerm, teaserFinder, termStarts, termEnds, resultTeasers);
                    }

                    //The start coords are worked out inside getResultCoords, so none are supplied
                    getResultCoords(coordsText, mode, null, termStarts, termEnds, valuesSwapped, resultCoords);

                    //If only finding first occurance,
                    //Stop searching this text data for search term.
                    if (firstOccuranceOnly) {
                        foundFirst = true;
                        break;
                    }
                }

                //If only finding first occurance and first is found, stop.
                //NOTE(review): this leaves the loop over terms, so any later
                //terms are not searched in this writing mode - confirm intended.
                if (firstOccuranceOnly && foundFirst) {
                    break;
                }
            }
        }

        //Remove any trailing empty values
        resultCoords.trim();

        //If including tease values
        if (includeTease) {
            storeTeasers(resultTeasers);
        }

    }


    /**
     * Walks the coord text (a marker separated sequence of x coord, width and
     * text for each character) to find the page area of one match and appends
     * it to resultCoords. A match running over a line break is stored as
     * several areas, each but the last flagged with MULTIPLE_AREA_RESULT.
     *
     * @param coordText     search text which still holds the coord data
     * @param mode          writing mode being searched
     * @param resultStart   working value, replaced once the start of the match is found
     * @param termStarts    index of the first character of the match in the text without coord data
     * @param termEnds      index of the last character of the match in the text without coord data
     * @param valuesSwapped true for the vertical writing modes
     * @param resultCoords  receives five values per area
     */
    private void getResultCoords(final String coordText, final int mode, int[] resultStart, int termStarts, final int termEnds, final boolean valuesSwapped, final Vector_Float resultCoords) {

        //Get coords of found text for highlights
        float currentX;
        float width;

        final char MARKER2 = PdfGroupingAlgorithms.MARKER2;
        final int length = coordText.length();

        //Track point in text data line (without coord data)
        int pointInLine = -1;

        //Track line on page
        int lineCounter = 0;

        //Skip null values and value not in the correct writing mode to ensure correct result coords
        while (lineCounter < lines.length && (lines[lineCounter].getRawData() == null ||
                Strip.stripXML(lines[lineCounter].getRawData(), true).toString().isEmpty() ||
                mode != lines[lineCounter].getWritingMode())) {
            lineCounter++;
        }

        //No line holds text for this writing mode, so there are no coords to report
        if (lineCounter == lines.length) {
            return;
        }

        //Flag used to catch if result is split accross lines
        boolean startFound = false;

        //Cycle through coord text looking for coords of this result
        //Ignore first value as it is known to be the first marker
        for (int pointer = 1; pointer < length; pointer++) {

            // find second marker and get x coord
            int startPointer = pointer;
            pointer = coordText.indexOf(MARKER2, startPointer);
            if (pointer < 0) {
                pointer = length;
            }

            //Convert text to float value for x coord
            currentX = Float.parseFloat(coordText.substring(startPointer, pointer));
            pointer++;

            // find third marker and get width
            startPointer = pointer;
            pointer = coordText.indexOf(MARKER2, startPointer);
            if (pointer < 0) {
                pointer = length;
            }

            //Convert text to float value for character width
            width = Float.parseFloat(coordText.substring(startPointer, pointer));
            pointer++;

            // find fourth marker and get text (character)
            startPointer = pointer;
            pointer = coordText.indexOf(MARKER2, startPointer);
            if (pointer < 0) {
                pointer = length;
            }

            //Store text to check for newline character later
            final String text = coordText.substring(startPointer, pointer);
            pointInLine += text.length();

            //Start of term not found yet.
            //Point in line is equal to or greater than start of the term.
            //Store coords and mark start as found.
            if (!startFound && pointInLine >= termStarts) {
                int currentY = (int) lines[lineCounter].getY1();
                if (valuesSwapped) {
                    currentY = (int) lines[lineCounter].getX2();
                }
                resultStart = new int[]{(int) currentX, currentY};
                startFound = true;
            }

            //Point in line is equal to or greater than end of the term.
            //Store coords and stop, nothing after this point adds to the result.
            if (pointInLine >= termEnds) {
                int currentY = (int) lines[lineCounter].getY2();
                if (valuesSwapped) {
                    currentY = (int) lines[lineCounter].getX1();
                }
                storeResultsCoords(valuesSwapped, mode, resultCoords, resultStart[0], resultStart[1], (currentX + width), currentY, 0.0f);

                return;
            }

            final boolean lineEnds = text.contains("\n");

            //Start of term found, end not reached and new line character found.
            //Set up multi line result.
            if (startFound && lineEnds) {

                storeResultsCoords(valuesSwapped, mode, resultCoords, resultStart[0], resultStart[1], (currentX + width), lines[lineCounter].getY2(), MULTIPLE_AREA_RESULT);

                //Set start of term as not found
                startFound = false;

                //Set this point in line as start of next term
                //Guarantees next character is found as
                //start of the next part of the search term
                termStarts = pointInLine;
            }

            //Progress the line number when we find a \n
            //This is to allow the correct calculation of y coords
            if (lineEnds) {
                lineCounter++;

                //If current content pointed at is null or not the correct writing mode, skip value until data is found
                while (lineCounter < lines.length && (lines[lineCounter].getRawData() == null ||
                        Strip.stripXML(lines[lineCounter].getRawData(), true).toString().isEmpty() ||
                        mode != lines[lineCounter].getWritingMode())) {
                    lineCounter++;
                }
            }

        }
    }

    /**
     * Empties the list of teasers returned by getTeasers().
     */
    protected void clearStoredTeasers() {
        multipleTermTeasers.clear();
    }

    /**
     * Moves the teasers of one writing mode into the list returned by
     * getTeasers() and empties the working store.
     */
    private void storeTeasers(final Vector_String resultTeasers) {

        //Remove any trailing empty values
        resultTeasers.trim();

        for (final String teaser : resultTeasers.get()) {
            multipleTermTeasers.add(teaser);
        }

        //Working store is shared between writing modes, so empty it
        //to prevent the same teasers being stored twice
        resultTeasers.clear();
    }


    /**
     * Appends one result area to resultCoords as four coords followed by the
     * connected value. The order of the coords depends on the writing mode:
     * unswapped modes store x1, y1, x2, y2; VERTICAL_BOTTOM_TO_TOP stores
     * y2, x2, y1, x1; the other swapped mode stores y2, x1, y1, x2.
     */
    private static void storeResultsCoords(final boolean valuesSwapped, final int mode, final Vector_Float resultCoords, final float x1, final float y1, final float x2, final float y2, final float connected) {

        final float[] area;
        if (!valuesSwapped) {
            area = new float[]{x1, y1, x2, y2};
        } else if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
            area = new float[]{y2, x2, y1, x1};
        } else {
            area = new float[]{y2, x1, y1, x2};
        }

        for (final float coord : area) {
            resultCoords.addElement(coord);
        }

        //Mark next result as linked
        resultCoords.addElement(connected);
    }


    /**
     * Stores a teaser for one match. The next teaser match is used when it
     * starts before and ends after the term; otherwise the supplied default
     * is stored unchanged.
     *
     * @param teaser        default teaser, stored when no surrounding text is found
     * @param teaserFinder  matcher for the term together with its surrounding words
     * @param termStarts    index of the first character of the term in the search text
     * @param termEnds      index of the last character of the term in the search text
     * @param resultTeasers receives the teaser
     */
    private void findTeaser(String teaser, final Matcher teaserFinder, final int termStarts, final int termEnds, final Vector_String resultTeasers) {

        //Get a teaser if found and set the search term to bold if allowed
        if (teaserFinder.find() && teaserFinder.start() < termStarts && teaserFinder.end() > termEnds) {

            //replace default with found teaser
            teaser = teaserFinder.group();

            if (includeHTMLtags) {
                //Calculate points to add bold tags
                final int teaseStarts = termStarts - teaserFinder.start();
                final int teaseEnds = (termEnds - teaserFinder.start()) + 1;

                //Add bold tags
                teaser = teaser.substring(0, teaseStarts) + "<b>"
                        + teaser.substring(teaseStarts, teaseEnds) + "</b>"
                        + teaser.substring(teaseEnds);
            }

            //Next teaser has to be looked for after this term
            teaserFinder.region(termEnds + 1, teaserFinder.regionEnd());
        }

        //Store teaser
        resultTeasers.addElement(teaser);
    }


    /**
     * Rewrites a search term so that every run of right to left characters
     * is reversed while left to right runs keep their order. Characters
     * without a strong direction take the direction of the run they are in.
     *
     * @param testTerm the term as supplied by the caller, must not be null
     * @return the reordered term; an empty term is returned unchanged
     */
    private static String alterStringTooDisplayOrder(final String testTerm) {

        //Nothing to reorder, and charAt(0) below needs at least one character
        if (testTerm.isEmpty()) {
            return testTerm;
        }

        final StringBuilder searchValue = new StringBuilder(testTerm.length());
        final StringBuilder currentBlock = new StringBuilder();

        byte lastDirection = Character.getDirectionality(testTerm.charAt(0));
        for (int i = 0; i != testTerm.length(); i++) {
            final char character = testTerm.charAt(i);
            byte dir = Character.getDirectionality(character);

            //Only track is changing from left to right or right to left
            switch (dir) {
                case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
                case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
                case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
                case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
                    dir = Character.DIRECTIONALITY_RIGHT_TO_LEFT;
                    break;
                case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
                case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
                case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
                    dir = Character.DIRECTIONALITY_LEFT_TO_RIGHT;
                    break;
                default:
                    dir = lastDirection;
                    break;
            }

            //Save and reset block if direction changed
            if (dir != lastDirection) {
                searchValue.append(currentBlock);
                currentBlock.setLength(0);
                lastDirection = dir;
            }

            //Right to left text is built back to front
            if (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT) {
                currentBlock.insert(0, character);
            } else {
                currentBlock.append(character);
            }
        }
        searchValue.append(currentBlock);

        return searchValue.toString();
    }


    /**
     * Joins the raw data of every line in the given writing mode into one
     * text, each line followed by a newline so results split over lines can
     * be detected, and strips the xml from it.
     *
     * @param includeCoords when false the hidden markers are removed as well
     * @param mode          writing mode of the lines to include
     * @return the text to search, or the text to read coords from
     */
    private String buildSearchText(final boolean includeCoords, final int mode) {

        final StringBuilder merged = new StringBuilder();
        for (final Line line : lines) {
            final String raw = line.getRawData();
            if (raw != null && mode == line.getWritingMode()) {
                merged.append(raw).append('\n');
            }
        }

        //Replace double spaces with single spaces
        String text = removeDuplicateSpaces(merged.toString());

        //Plain search text carries no hidden markers
        if (!includeCoords) {
            text = removeHiddenMarkers(text);
        }

        return Strip.stripXML(text, true).toString();
    }

    /**
     * Collapses every run of two or more spaces into a single space.
     *
     * @param textValue the text to clean up
     * @return the text with no two spaces next to each other
     */
    private static String removeDuplicateSpaces(String textValue) {

        //One pass only halves a run, so repeat until no pair is left
        while (textValue.contains("  ")) {
            textValue = textValue.replace("  ", " ");
        }
        return textValue;
    }

    /**
     * Converts the search type bit flags into the flags used to compile the
     * search patterns.
     *
     * @param searchType bit flags for the search (See class SearchType)
     * @return bitwise combination of Pattern flags
     */
    private static int loadSearcherOptions(final int searchType) {

        final boolean caseSensitive = (searchType & SearchType.CASE_SENSITIVE) == SearchType.CASE_SENSITIVE;
        final boolean multiLine = (searchType & SearchType.MUTLI_LINE_RESULTS) == SearchType.MUTLI_LINE_RESULTS;

        //Case is ignored unless a case sensitive search was asked for
        final int caseFlags = caseSensitive ? 0 : Pattern.CASE_INSENSITIVE;

        //Allow search to find split line results
        final int lineFlags = multiLine ? (Pattern.MULTILINE | Pattern.DOTALL) : 0;

        return caseFlags | lineFlags;
    }

    /**
     * Orders the four writing modes by how many items each has, largest
     * count first.
     *
     * @param unsorted item count for each writing mode, indexed by mode
     * @return the writing mode indexes ordered from most to least used
     */
    private static int[] getWritingModeOrder(final int[] unsorted) {

        final int[] ascending = {unsorted[0], unsorted[1], unsorted[2], unsorted[3]};
        Arrays.sort(ascending);

        //-1 marks a position which has not been given a mode yet
        final int[] order = new int[4];
        Arrays.fill(order, -1);

        for (int mode = 0; mode != unsorted.length; mode++) {
            for (int rank = 0; rank < ascending.length; rank++) {

                //counts are sorted ascending, so the last rank goes first
                final int position = 3 - rank;

                //equal counts share a value, so take the first free position
                if (unsorted[mode] == ascending[rank] && order[position] == -1) {
                    order[position] = mode;
                    break;
                }
            }
        }
        return order;
    }

    /**
     * Counts how many items use each of the writing modes 0 to 3. Items with
     * any other writing mode are not counted.
     *
     * @param items the items to inspect
     * @return the counts indexed by writing mode
     */
    private int[] getWritingModeCounts(final Line[] items) {

        final int[] counts = new int[4];

        for (final Line item : items) {
            final int mode = item.getWritingMode();
            if (mode >= 0 && mode < counts.length) {
                counts[mode]++;
            }
        }

        return counts;
    }

    /**
     * Removes shadow text (the same text printed twice, one copy over the other) and
     * merges "drowned" fragments, i.e. a fragment lying wholly inside another one.
     *
     * @param avoidSpaces when true, a contained fragment is only merged if the chosen
     *                    separator holds no space character
     */
    private void cleanupShadowsAndDrownedObjects(final boolean avoidSpaces) {

        final int total = fragments.length;

        //compare every fragment with each fragment that follows it
        for (int master = 0; master < total; master++) {

            //centre point of the master fragment
            float centreX = (fragments[master].getX1() + fragments[master].getX2()) / 2;
            float centreY = (fragments[master].getY1() + fragments[master].getY2()) / 2;

            for (int child = master + 1; child < total; child++) {

                //skip children without width and pairs where either side is already merged
                if (fragments[child].getX1() == fragments[child].getX2()
                        || fragments[child].hasMerged()
                        || fragments[master].hasMerged()) {
                    continue;
                }

                final boolean sameFontSize = fragments[child].getFontSize() == fragments[master].getFontSize();

                final float widthDelta = Math.abs(
                        (fragments[child].getX2() - fragments[child].getX1())
                                - (fragments[master].getX2() - fragments[master].getX1()));

                //shadow: same font size, similar width and the master centre falls inside the child
                final boolean isShadow = sameFontSize
                        && centreX > fragments[child].getX1() && centreX < fragments[child].getX2()
                        && widthDelta < 10
                        && centreY < fragments[child].getY1() && centreY > fragments[child].getY2();

                if (isShadow) {
                    fragments[child].setMerged(true);
                    continue;
                }

                //drowned text: one box strictly inside the other
                final boolean childInsideMaster =
                        fragments[child].getX1() > fragments[master].getX1() && fragments[child].getX2() < fragments[master].getX2()
                                && fragments[child].getY1() < fragments[master].getY1() && fragments[child].getY2() > fragments[master].getY2();
                final boolean masterInsideChild =
                        fragments[master].getX1() > fragments[child].getX1() && fragments[master].getX2() < fragments[child].getX2()
                                && fragments[master].getY1() < fragments[child].getY1() && fragments[master].getY2() > fragments[child].getY2();

                if (!childInsideMaster && !masterInsideChild) {
                    continue;
                }

                //the fragment with the larger y2 is passed as the first line when choosing the separator
                final String separator = fragments[master].getY2() > fragments[child].getY2()
                        ? getLineDownSeparator(fragments[master].getRawData(), fragments[child].getRawData())
                        : getLineDownSeparator(fragments[child].getRawData(), fragments[master].getRawData());

                if (!avoidSpaces || separator.indexOf(' ') == -1) {
                    merge(fragments[master], fragments[child], separator);
                }

                //the master box may have grown, so refresh its centre
                centreX = (fragments[master].getX1() + fragments[master].getX2()) / 2;
                centreY = (fragments[master].getY1() + fragments[master].getY2()) / 2;
            }
        }
    }

    /**
     * Chooses the text used to join one line to the line that follows it: a space,
     * a line break or nothing.
     *
     * @param rawLine1 raw content of the first line
     * @param rawLine2 raw content of the line joined after it
     * @return the separator; a single space unless one of the rules below applies
     */
    private static String getLineDownSeparator(final String rawLine1, final String rawLine2) {

        //underline handling is switched off in this version
        final boolean hasUnderline = false;

        //trimmed copies of both lines so the characters around the join can be inspected
        //(Strip.trim is expected to drop surrounding whitespace - confirm)
        StringBuilder first = new StringBuilder(rawLine1);
        StringBuilder second = new StringBuilder(rawLine2);
        first = Strip.trim(first);
        second = Strip.trim(second);

        String separator = " "; //space is default

        final int firstLen = first.length();
        final int secondLen = second.length();

        //the rules need two characters on either side of the join
        if (firstLen > 1 && secondLen > 1) {

            final char endChar = first.charAt(firstLen - 1);
            final char beforeEndChar = first.charAt(firstLen - 2);
            final char startChar = second.charAt(0);
            final char afterStartChar = second.charAt(1);

            //NOTE(review): the hyphen list is empty, so the hyphenation rule never fires as written
            final String hyphen_values = "";
            final boolean endsWithHyphen = hyphen_values.indexOf(endChar) != -1;

            final boolean firstEndsSentence = beforeEndChar == '.' || endChar == '.';
            final boolean secondStartsSentence = Character.isUpperCase(startChar)
                    || startChar == '&'
                    || Character.isUpperCase(afterStartChar)
                    || afterStartChar == '&';

            if (endsWithHyphen) {
                //hyphenation: join directly, except after ":" (line break) or a space (keep the space)
                if (endChar == ' ') {
                    separator = " ";
                } else if (beforeEndChar == ':') {
                    separator = "\n";
                } else {
                    separator = "";
                }
            } else if (firstEndsSentence && secondStartsSentence) {
                //full stop followed by a capital letter or '&' is treated as a paragraph break
                separator = "\n";
            }
        }

        //add an underline if appropriate
        if (hasUnderline) {
            separator += '\n';
        }

        return separator;
    }

    /**
     * Decides whether the gap between two text fragments should be treated as a space.
     *
     * @param c index into {@code fragments} of one fragment
     * @param l index into {@code fragments} of the other fragment
     * @param actualGap measured gap between the two fragments
     * @param addMultiplespaceXMLTag whether gaps of more than one space should be flagged
     * @param writingMode writing mode of the text being joined
     * @return {@code " "} when the gap amounts to at least one space, otherwise {@code ""}
     */
    private String isGapASpace(final int c, final int l, final float actualGap, final boolean addMultiplespaceXMLTag, final int writingMode) {

        //space width scaled by font size for both fragments; the smaller one is the yardstick
        final float widthC = fragments[c].getSpaceWidth() * fragments[c].getFontSize();
        final float widthL = fragments[l].getSpaceWidth() * fragments[l].getFontSize();
        final float narrowest = (widthC > widthL) ? widthL : widthC;

        //gap expressed as a number of spaces
        float spaces = actualGap / (narrowest / 1000);

        //float -> int conversion rounds down, so lift anything just over half a space to one
        if (spaces > 0.51f && spaces < 1) {
            spaces = 1;
        }

        final int spaceCount = (int) spaces;

        String sep = (spaceCount > 0) ? " " : "";

        //NOTE(review): this branch is meant to flag multiple spaces with an XML tag but only
        //assigns a single space again - the tag text may have been lost; confirm against upstream
        if (spaceCount > 1 && addMultiplespaceXMLTag && writingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) {
            sep = " ";
        }

        return sep;
    }

    /**
     * Converts the fragments into lines of text and stores the result in {@code lines}.
     * Fragments of the requested writing mode that sit next to each other on the same
     * line are merged; every fragment that has not been merged into another one
     * (including those of other writing modes) ends up in {@code lines}.
     *
     * @param mode writing mode to build lines for (one of the {@code PdfData} writing mode constants)
     * @param breakOnSpace when true, stop looking for further joins for a fragment as soon
     *                     as a join would need a space
     * @param addMultiplespaceXMLTag passed through to {@code isGapASpace}
     * @param isSearch currently has no effect on the result
     * @throws PdfException if {@code mode} is not a known writing mode
     */
    @SuppressWarnings("unused")
    private void createLinesForSearch(final int mode, final boolean breakOnSpace, final boolean addMultiplespaceXMLTag, final boolean isSearch) throws PdfException {

        String separator;

        final boolean debug = false;

        //work on a copy so the order of the fragments array itself is left alone
        final Line[] localLines = fragments.clone();

        //number of lines that will remain: start from everything not already merged
        int finalCount = localLines.length;
        for (int i = 0; i != localLines.length; i++) {
            if (localLines[i].hasMerged()) {
                finalCount--;
            }
        }

        //reverse order if text right to left
        final boolean reversed = mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM;
        if (reversed) {
            for (int i = 0; i < localLines.length; i++) {
                localLines[i] = fragments[localLines.length - i - 1];
            }
        }

        //scan items joining best fit to right of each fragment to build lines.
        for (int master = 0; master < localLines.length; master++) {

            int id = -1;

            if (!localLines[master].hasMerged() && localLines[master].getWritingMode() == mode) {

                if (debug) {
                    System.out.println("Look for match with " + removeHiddenMarkers(localLines[master].getRawData()));
                }

                for (int child = 0; child < localLines.length && id == -1; child++) {

                    /*
                     * Coordinates altered so x axis positive follows line direction
                     * and y axis negative follows paragraph direction.
                     * Coordinates in the order x1, y1, x2, y2.
                     * The master values are re-read on every pass because a merge changes them.
                     */
                    final float[] masterCoords = getCoordsForWritingMode(localLines[master], mode);
                    final float[] childCoords = getCoordsForWritingMode(localLines[child], mode);

                    if (!localLines[child].hasMerged() && master != child && localLines[master].getWritingMode() == localLines[child].getWritingMode() && childCoords[0] != childCoords[2]) {
                        if (debug) {
                            System.out.println("Checking " + removeHiddenMarkers(localLines[child].getRawData()));
                        }
                        //Get central points
                        final float mx = masterCoords[0] + ((masterCoords[2] - masterCoords[0]) / 2);
                        final float my = masterCoords[3] + ((masterCoords[1] - masterCoords[3]) / 2);
                        final float cx = childCoords[0] + ((childCoords[2] - childCoords[0]) / 2);
                        final float cy = childCoords[3] + ((childCoords[1] - childCoords[3]) / 2);

                        //height of the shorter of the two fragments
                        float smallestHeight = (masterCoords[1] - masterCoords[3]);
                        final float fontDifference = (childCoords[1] - childCoords[3]) - smallestHeight;
                        if (fontDifference < 0) {
                            smallestHeight = (childCoords[1] - childCoords[3]);
                        }

                        //Don't merge if the heights differ by twice the smaller height or more
                        if (Math.abs(fontDifference) < smallestHeight * 2) {
                            //Check for the same line by checking the center of
                            //child is within master area
                            if (Math.abs(my - cy) < (smallestHeight * 0.5)) {
                                if (mx < cx) { //Child on right
                                    final float distance = childCoords[0] - masterCoords[2];
                                    if (distance <= smallestHeight / 2) {
                                        id = child;
                                    }
                                }
                            }
                        }
                        //Match has been found
                        if (id != -1) {
                            float possSpace = childCoords[0] - masterCoords[2];
                            if (reversed) {
                                possSpace = -possSpace;
                            }

                            //isGapASpace reads from fragments, so translate the positions back
                            //whenever localLines holds the fragments in reverse order
                            final int masterFragment = reversed ? localLines.length - master - 1 : master;
                            final int childFragment = reversed ? localLines.length - id - 1 : id;

                            //add space if gap between this and last object
                            separator = isGapASpace(masterFragment, childFragment, possSpace, addMultiplespaceXMLTag, mode);

                            //stop joining onto this fragment when a space would be needed
                            if (breakOnSpace && separator.startsWith(" ")) {
                                break;
                            }

                            if (debug) {
                                System.out.println("Merge items " + master + " & " + id);
                                System.out.println("c  : " + removeHiddenMarkers(localLines[master].getRawData()));
                                System.out.println("id : " + removeHiddenMarkers(localLines[id].getRawData()));
                                System.out.println("");
                            }

                            //see if on right: the child must start after the master, with the
                            //comparison flipped for top-to-bottom text. The guards above already
                            //ensure child != master and that master uses the requested mode.
                            final boolean childFollowsMaster = (mode == PdfData.VERTICAL_TOP_TO_BOTTOM)
                                    ? childCoords[0] < masterCoords[0]
                                    : childCoords[0] > masterCoords[0];

                            if (childFollowsMaster) {
                                merge(localLines[master], localLines[id], separator);
                                finalCount--;
                            }

                            //keep scanning the remaining children for further joins
                            id = -1;
                        }
                    }
                }
            }
        }

        //collect everything that was not merged into another fragment
        lines = new Line[finalCount];
        int next = 0;
        for (int i = 0; i != localLines.length; i++) {
            if (!localLines[i].hasMerged()) {
                lines[next] = localLines[i];
                next++;
            }
        }
    }

    /**
     * Returns the corners of a line re-ordered for the given writing mode so callers
     * can treat every mode like left-to-right text.
     *
     * @param line fragment whose coordinates are read
     * @param mode writing mode, one of the {@code PdfData} writing mode constants
     * @return four values in the order x1, y1, x2, y2
     * @throws PdfException if {@code mode} is not one of the four known writing modes
     */
    private float[] getCoordsForWritingMode(final Line line, final int mode) throws PdfException {
        switch (mode) {
            case PdfData.HORIZONTAL_LEFT_TO_RIGHT:
                return new float[]{line.getX1(), line.getY1(), line.getX2(), line.getY2()};

            case PdfData.HORIZONTAL_RIGHT_TO_LEFT:
                //x values swapped, y values unchanged
                return new float[]{line.getX2(), line.getY1(), line.getX1(), line.getY2()};

            case PdfData.VERTICAL_BOTTOM_TO_TOP:
            case PdfData.VERTICAL_TOP_TO_BOTTOM:
                //both vertical modes use the same mapping: y runs along the line, x across it
                return new float[]{line.getY2(), line.getX2(), line.getY1(), line.getX1()};

            default:
                throw new PdfException("Illegal value " + mode + " for currentWritingMode");
        }
    }

    /**
     * merge 2 text localFragments together and update co-ordinates
     */
    private void merge(final Line master, final Line child, final String separator) {

        //update co-ords
        if (master.getX1() > child.getX1()) {
            master.setX1(child.getX1());
        }
        if (master.getY1() < child.getY1()) {
            master.setY1(child.getY1());
        }
        if (master.getX2() < child.getX2()) {
            master.setX2(child.getX2());
        }
        if (master.getY2() > child.getY2()) {
            master.setY2(child.getY2());
        }

        final String test = Fonts.fe;
        StringBuilder masterString = new StringBuilder(master.getRawData());
        final StringBuilder childString = new StringBuilder(child.getRawData());

        //move  if needed and add separator
        if ((masterString.toString().lastIndexOf(test) != -1)) {
            final String masterLocal = masterString.toString();
            masterString = new StringBuilder(masterLocal.substring(0, masterLocal.lastIndexOf(test)));
            masterString.append(separator);
            masterString.append(masterLocal.substring(masterLocal.lastIndexOf(test)));
        } else {
            masterString.append(separator);
        }

        //Only map out space if text length is longer than 1
        if (child.getTextLength() > 1 && masterString.toString().endsWith(" ")) {
            masterString.deleteCharAt(masterString.lastIndexOf(" "));
        }
        //use font size of second text (ie at end of merged text)
        master.setFontSize(child.getFontSize());

        //Remove excess / redundent xml tags
        if ((childString.indexOf("", masterString.lastIndexOf("") + 7 == masterString.lastIndexOf(">"))) {
            childString.replace(childString.indexOf("") + 1, "");
            masterString.replace(masterString.lastIndexOf(""), masterString.lastIndexOf("") + 8, "");
        }

        if ((childString.indexOf("", masterString.lastIndexOf("") + 6 == masterString.lastIndexOf(">"))) {
            childString.replace(childString.indexOf("") + 1, "");
            masterString.replace(masterString.lastIndexOf(""), masterString.lastIndexOf("") + 7, "");
        }

        masterString = masterString.append(childString);

        //track length of text less all tokens
        master.setTextLength(master.getTextLength() + child.getTextLength());

        //set objects to null to flush and log as used
        child.setRawData(null);
        child.setMerged(true);

        master.setRawData(masterString.toString());

//            //use font size of second text (ie at end of merged text)
//            master.setFontSize(child.getFontSize());
//
//            //add together
//            StringBuilder content = new StringBuilder();
//            content.append(master.getRawData()).append(separator).append(child.getRawData());
//            master.setRawData(content.toString());
//
//            //track length of text less all tokens
//            master.setTextLength(master.getTextLength()+child.getTextLength());
//
//            //set objects to null to flush and log as used
//            child.setRawData(null);
//            child.setMerged(true);
    }

    /**
     * Rebuilds {@code fragments} with one {@code Line} for every raw text element
     * reported by {@code pdf_data}.
     */
    private void copyToArrays() {

        fragments = new Line[pdf_data.getRawTextElementCount()];

        //wrap each raw text element in its own Line
        int index = 0;
        while (index < fragments.length) {
            fragments[index] = new Line(pdf_data, index);
            index++;
        }
    }

    /**
     * Sets whether HTML tags are included in teasers.
     *
     * @param value True to include HTML, otherwise false
     */
    protected void setIncludeHTML(final boolean value) {
        this.includeHTMLtags = value;
    }

    /**
     * Switches teaser generation during searching on or off.
     *
     * @param value True to generate teasers, otherwise false
     */
    protected void generateTeasers(final boolean value) {
        this.includeTease = value;
    }

    /**
     * Reports the flag that controls teaser generation.
     *
     * @return True if teasers are being generated, otherwise false
     */
    protected boolean isGeneratingTeasers() {
        return this.includeTease;
    }

    /**
     * One text fragment (or a line built from several fragments) together with its
     * bounding coordinates, raw content and the text attributes copied from PdfData.
     */
    private class Line implements Comparable<Line> {
        private float x1, y1, x2, y2, character_spacing, spaceWidth;
        private String raw, currentColor;
        private int text_length, mode, fontSize;
        private boolean hasMerged;

        /**
         * Creates a line from the text element stored at the given index.
         *
         * @param pdf_data source of the per-element arrays
         * @param index position of the element in those arrays
         */
        Line(final PdfData pdf_data, final int index) {
            loadData(pdf_data, index);
        }

        /**
         * Copies every attribute of element {@code index} out of the PdfData arrays
         * and marks this line as not merged.
         */
        private void loadData(final PdfData pdf_data, final int index) {
            //extract values
            character_spacing = pdf_data.f_character_spacing[index];
            x1 = pdf_data.f_x1[index];
            x2 = pdf_data.f_x2[index];
            y1 = pdf_data.f_y1[index];
            y2 = pdf_data.f_y2[index];
            currentColor = pdf_data.colorTag[index];
            text_length = pdf_data.text_length[index];
            mode = pdf_data.f_writingMode[index];
            raw = pdf_data.contents[index];
            fontSize = pdf_data.f_end_font_size[index];
            spaceWidth = pdf_data.space_width[index];
            hasMerged = false;
        }

        protected float getX1() {
            return x1;
        }

        protected float getY1() {
            return y1;
        }

        protected float getX2() {
            return x2;
        }

        protected float getY2() {
            return y2;
        }

        protected float getCharacterSpacing() {
            return character_spacing;
        }

        protected float getSpaceWidth() {
            return spaceWidth;
        }

        /**
         * @return raw content of this line; null once the line has been merged into another
         */
        protected String getRawData() {
            return raw;
        }

        protected String getColorTag() {
            return currentColor;
        }

        protected int getWritingMode() {
            return mode;
        }

        protected int getTextLength() {
            return text_length;
        }

        protected int getFontSize() {
            return fontSize;
        }

        /**
         * @return true once this line has been flagged as merged (used up)
         */
        protected boolean hasMerged() {
            return hasMerged;
        }

        protected void setX1(final float value) {
            x1 = value;
        }

        protected void setY1(final float value) {
            y1 = value;
        }

        protected void setX2(final float value) {
            x2 = value;
        }

        protected void setY2(final float value) {
            y2 = value;
        }

        protected void setFontSize(final int value) {
            fontSize = value;
        }

        protected void setRawData(final String value) {
            raw = value;
        }

        protected void setTextLength(final int value) {
            text_length = value;
        }

        protected void setMerged(final boolean value) {
            hasMerged = value;
        }

        /**
         * Orders lines by y1 for horizontal writing modes and by x1 for vertical ones,
         * using this line's own writing mode. The difference is truncated to an int,
         * so values less than one unit apart compare as equal.
         *
         * @param o line to compare with
         * @return negative, zero or positive as this line sorts before, with or after {@code o};
         * zero when this line's mode is none of the four known writing modes
         */
        @Override
        public int compareTo(final Line o) {
            //NOTE(review): truncating the difference is not transitive for values spaced
            //under one unit apart; kept as is because sort order may rely on that tolerance
            switch (mode) {
                case PdfData.HORIZONTAL_LEFT_TO_RIGHT:
                case PdfData.HORIZONTAL_RIGHT_TO_LEFT:
                    return (int) (y1 - o.getY1());
                case PdfData.VERTICAL_TOP_TO_BOTTOM:
                case PdfData.VERTICAL_BOTTOM_TO_TOP:
                    return (int) (x1 - o.getX1());
            }
            return 0;
        }
    }
}
Related Artifacts