// org.jpedal.grouping.PdfSearchUtils Maven / Gradle / Ivy

/*
 * ===========================================
 * Java Pdf Extraction Decoding Access Library
 * ===========================================
 *
 * Project Info:  http://www.idrsolutions.com
 * Help section for developers at http://www.idrsolutions.com/support/
 *
 * (C) Copyright 1997-2017 IDRsolutions and Contributors.
 *
 * This file is part of JPedal/JPDF2HTML5
 *
 @LICENSE@
 *
 * ---------------
 * PdfSearchUtils.java
 * ---------------
 */
package org.jpedal.grouping;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jpedal.exception.PdfException;

import static org.jpedal.grouping.PdfGroupingAlgorithms.removeHiddenMarkers;

import org.jpedal.objects.PdfData;
import org.jpedal.utils.Fonts;
import org.jpedal.utils.LogWriter;
import org.jpedal.utils.Strip;
import org.jpedal.utils.repositories.Vector_Float;
import org.jpedal.utils.repositories.Vector_String;

public class PdfSearchUtils {

    //When set, teasers have the matched term marked up (see findTeaser)
    private boolean includeHTMLtags;

    //Teasers accumulated by storeTeasers() until clearStoredTeasers() is called.
    //Typed as List<String> so getTeasers() can return a String[] without casts
    private final List<String> multipleTermTeasers = new ArrayList<String>();

    //Hold data from pdf so we can create local version
    private final PdfData pdf_data;

    //Text fragments extracted from pdf_data for the current search
    private Line[] fragments;

    //Lines read by buildSearchText() and getResultCoords() when matching
    private Line[] lines;

    //Value placed between result areas to show they are part of the same result
    private static final int MULTIPLE_AREA_RESULT = -101;

    //When set, a teaser is stored for every result found
    private boolean includeTease;

    /**
     * Creates a search helper bound to the supplied page text data.
     *
     * @param pdf_data text data that the search methods of this instance read from
     */
    protected PdfSearchUtils(final PdfData pdf_data) {
        this.pdf_data = pdf_data;
    }

    /**
     * Searches a rectangular area of the currently loaded page for the given terms and
     * returns the areas of the matches.
     * <p>
     * The corner values are normalised first (swapped if supplied the wrong way round), the
     * text inside the area is extracted, and each writing mode that has text is then searched
     * in the order given by {@code getWritingModeOrder}.
     *
     * @param x1         is the x coord of the top left corner
     * @param y1         is the y coord of the top left corner
     * @param x2         is the x coord of the bottom right corner
     * @param y2         is the y coord of the bottom right corner
     * @param terms      search terms, each String is treated as a single term
     * @param searchType bit flags controlling the search (see class SearchType)
     * @return pdf page coords of the found text (origin is the bottom left hand corner of the
     * unrotated page), five values per result area:
     * <ul>
     * <li>[0] result x1 coord</li>
     * <li>[1] result y1 coord</li>
     * <li>[2] result x2 coord</li>
     * <li>[3] result y2 coord</li>
     * <li>[4] -101 if the next area is the remainder of this result on another line,
     * any other value is ignored</li>
     * </ul>
     * An empty array is returned when {@code terms} is null.
     * @throws PdfException if the page content being searched contains invalid data that the search can not recover from
     */
    @SuppressWarnings("UnusedParameters")
    protected final float[] findText(
            int x1,
            int y1,
            int x2,
            int y2,
            final String[] terms,
            final int searchType)
            throws PdfException {

        //Without search terms there is nothing to look for
        if (terms == null) {
            return new float[]{};
        }

        //Holders for the coords and teasers of everything found
        final Vector_Float foundCoords = new Vector_Float(0);
        final Vector_String foundTeasers = new Vector_String(0);

        //Normalised corners, ordered as {x1, y1, x2, y2}
        final int[] area = validateCoordinates(x1, y1, x2, y2);

        //Pull the text inside the area into local arrays (min x, min y, max x, max y)
        copyToArraysPartial(area[0], area[3], area[2], area[1]);

        //Hidden text should never be reported as a result
        cleanupShadowsAndDrownedObjects(false);

        final int[] modeCounts = getWritingModeCounts(fragments.clone());

        for (final int mode : getWritingModeOrder(modeCounts)) {

            //Writing modes without any text are skipped
            if (modeCounts[mode] != 0) {
                searchWritingMode(mode, searchType, terms, foundCoords, foundTeasers);
            }
        }

        return foundCoords.get();
    }

    /**
     * Returns the teasers stored by the searches run since the stored teasers were last
     * cleared. Teasers are only stored while teaser generation is switched on.
     *
     * @return String[] holding one teaser per result (single or linked areas) in result order
     */
    protected String[] getTeasers() {
        final String[] teasers = new String[multipleTermTeasers.size()];
        return multipleTermTeasers.toArray(teasers);
    }

    /**
     * Copies every raw text element that lies at least partly inside the given area into
     * {@code fragments}.
     * <p>
     * For each accepted element the raw data is rebuilt so that it only keeps the characters
     * lying completely inside the area (along the writing direction), and the fragment's
     * extent along the writing direction is reset to run from the first character's coord to
     * the last character's coord plus its width.
     *
     * @param minX lowest x value of the area
     * @param minY lowest y value of the area
     * @param maxX highest x value of the area
     * @param maxY highest y value of the area
     */
    private void copyToArraysPartial(final int minX, final int minY, final int maxX, final int maxY) {

        final int elementCount = pdf_data.getRawTextElementCount();
        final String marker = PdfData.marker;

        //Sized for the worst case, trimmed to the accepted count at the end
        final Line[] accepted = new Line[elementCount];
        int acceptedCount = 0;

        for (int i = 0; i < elementCount; i++) {

            //Elements completely outside the area are not copied
            if (!isFragmentWithinArea(pdf_data, i, minX, minY, maxX, maxY)) {
                continue;
            }

            final int mode = pdf_data.f_writingMode[i];
            final boolean horizontal = mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT;

            final Line fragment = new Line(pdf_data, i);
            accepted[acceptedCount] = fragment;

            //Split raw data into leading tags, marker separated content and trailing tags
            final String raw = fragment.getRawData();
            final int firstMarker = raw.indexOf(marker);
            final int lastMarker = raw.lastIndexOf(marker);

            final StringBuilder rebuilt = new StringBuilder(raw.substring(0, firstMarker));
            final String contentText = raw.substring(firstMarker, raw.indexOf('<', lastMarker));

            //Trailing tags start at the first '<' after the last marker (skips last section of text)
            final String tail = raw.substring(lastMarker);
            final String endTags = tail.substring(tail.indexOf('<'));

            //Content is a repeating sequence of coord, width, character
            final StringTokenizer tokens = new StringTokenizer(contentText, marker);
            boolean firstCharacter = true;
            float width = 0;

            while (tokens.hasMoreTokens()) {

                final float coord = Float.parseFloat(tokens.nextToken());
                width = Float.parseFloat(tokens.nextToken());
                final String character = tokens.nextToken();

                //First character fixes the start of the fragment, every character moves the end
                if (horizontal) {
                    if (firstCharacter) {
                        fragment.setX1(coord);
                    }
                    fragment.setX2(coord);
                } else {
                    if (firstCharacter) {
                        fragment.setY2(coord);
                    }
                    fragment.setY1(coord);
                }
                firstCharacter = false;

                //Only characters completely inside the area are kept in the raw data
                final boolean inside = horizontal
                        ? (minX < coord && (coord + width) < maxX)
                        : (minY < coord && (coord + width) < maxY);

                if (inside) {
                    rebuilt.append(marker).append(coord)
                            .append(marker).append(width)
                            .append(marker).append(character);
                }
            }

            fragment.setRawData(rebuilt.append(endTags).toString());

            //End coord so far is the start of the last character, so add its width
            if (horizontal) {
                fragment.setX2(fragment.getX2() + width);
            } else {
                fragment.setY1(fragment.getY1() + width);
            }

            acceptedCount++;
        }

        fragments = Arrays.copyOf(accepted, acceptedCount);
    }

    /**
     * Tests whether raw text element {@code i} is at least partly inside the given area.
     * <p>
     * Along the writing direction the element only has to overlap the area. Across the
     * writing direction the band between 1/4 and 3/4 of the element's height must lie
     * inside the area. Elements with any other writing mode are never accepted.
     */
    private static boolean isFragmentWithinArea(final PdfData pdf_data, final int i, final int minX, final int minY, final int maxX, final int maxY) {

        final float x1 = pdf_data.f_x1[i];
        final float x2 = pdf_data.f_x2[i];
        final float y1 = pdf_data.f_y1[i];
        final float y2 = pdf_data.f_y2[i];

        switch (pdf_data.f_writingMode[i]) {
            case PdfData.HORIZONTAL_LEFT_TO_RIGHT:
            case PdfData.HORIZONTAL_RIGHT_TO_LEFT: {
                final float height = y1 - y2;

                //Area contains the x1 or x2 coord of the text
                final boolean textEdgeInArea = (minX < x1 && x1 < maxX) || (minX < x2 && x2 < maxX);
                //An x edge of the area lies between the x1 and x2 coords of the text
                final boolean areaEdgeInText = (x1 < minX && minX < x2) || (x1 < maxX && maxX < x2);
                //Middle band of the text height lies inside the y range of the area
                final boolean heightFits = minY < y2 + (height / 4) && y2 + (height * 0.75) < maxY;

                return (textEdgeInArea || areaEdgeInText) && heightFits;
            }
            case PdfData.VERTICAL_BOTTOM_TO_TOP:
            case PdfData.VERTICAL_TOP_TO_BOTTOM: {
                final float height = x2 - x1;

                //Area contains the y1 or y2 coord of the text
                final boolean textEdgeInArea = (minY < y1 && y1 < maxY) || (minY < y2 && y2 < maxY);
                //A y edge of the area lies between the y2 and y1 coords of the text
                final boolean areaEdgeInText = (y2 < minY && minY < y1) || (y2 < maxY && maxY < y1);
                //Middle band of the text height lies inside the x range of the area
                final boolean heightFits = minX < x1 + (height / 4) && x1 + (height * 0.75) < maxX;

                return (textEdgeInArea || areaEdgeInText) && heightFits;
            }
            default:
                return false;
        }
    }

    /**
     * Normalises the corners of an area so that x1 is not greater than x2 and y1 is not less
     * than y2. Values supplied the wrong way round are swapped and the swap is logged; no
     * exception is thrown.
     *
     * @return the normalised values ordered as {x1, y1, x2, y2}
     */
    private static int[] validateCoordinates(int x1, int y1, int x2, int y2) {

        int left = x1;
        int right = x2;
        if (left > right) {
            left = x2;
            right = x1;
            LogWriter.writeLog("x1 > x2, coordinates were swapped to validate");
        }

        int top = y1;
        int bottom = y2;
        if (top < bottom) {
            top = y2;
            bottom = y1;
            LogWriter.writeLog("y1 < y2, coordinates were swapped to validate");
        }

        return new int[]{left, top, right, bottom};
    }

    //

    /**
     * Searches the whole of the currently loaded page for the given terms and returns the
     * areas of the matches. A result may be split across multiple lines, in which case it is
     * returned as several linked areas.
     *
     * @param terms      the text to search for, each String is treated as a single term
     * @param searchType bit flags controlling how to search the pdf (see class SearchType)
     * @return pdf page coords of the found text (origin is the bottom left hand corner of the
     * unrotated page), five values per result area:
     * <ul>
     * <li>[0] result x1 coord</li>
     * <li>[1] result y1 coord</li>
     * <li>[2] result x2 coord</li>
     * <li>[3] result y2 coord</li>
     * <li>[4] -101 if the next area is the remainder of this result on another line,
     * any other value is ignored</li>
     * </ul>
     * An empty array is returned when {@code terms} is null.
     * @throws PdfException if the page content being searched contains invalid data that the search can not recover from
     */
    protected final float[] findText(
            final String[] terms,
            final int searchType)
            throws PdfException {

        //Without search terms there is nothing to look for
        if (terms == null) {
            return new float[]{};
        }

        //Holders for the coords and teasers of everything found
        final Vector_Float foundCoords = new Vector_Float(0);
        final Vector_String foundTeasers = new Vector_String(0);

        //Pull the page text into local arrays
        copyToArrays();

        //Hidden text should never be reported as a result
        cleanupShadowsAndDrownedObjects(false);

        final int[] modeCounts = getWritingModeCounts(fragments.clone());

        for (final int mode : getWritingModeOrder(modeCounts)) {

            //Writing modes without any text are skipped
            if (modeCounts[mode] != 0) {
                searchWritingMode(mode, searchType, terms, foundCoords, foundTeasers);
            }
        }

        return foundCoords.get();
    }


    /**
     * Searches the text of one writing mode for every term and appends the coords (and,
     * when teasers are switched on, the teasers) of each match to the supplied holders.
     * <p>
     * Null and empty terms are skipped, as are zero-length matches of a regular expression,
     * because neither identifies any text whose coords could be returned.
     *
     * @param mode          writing mode whose lines are searched
     * @param searchType    bit flags controlling the search (see class SearchType)
     * @param terms         search terms, each String is treated as a single term
     * @param resultCoords  holder the coords of each result are appended to
     * @param resultTeasers holder the teaser of each result is appended to
     * @throws PdfException if the page content being searched contains invalid data that the search can not recover from
     */
    private void searchWritingMode(final int mode, final int searchType, final String[] terms, final Vector_Float resultCoords, final Vector_String resultTeasers) throws PdfException {

        //Merge text fragments into lines as displayed on page
        createLinesForSearch(mode, false, false, true);

        //Bitwise flags for regular expressions engine
        final int options = loadSearcherOptions(searchType);

        //Stop after the first result
        final boolean firstOccuranceOnly = (searchType & SearchType.FIND_FIRST_OCCURANCE_ONLY) == SearchType.FIND_FIRST_OCCURANCE_ONLY;

        //Only find whole words, not partial words
        final boolean wholeWordsOnly = (searchType & SearchType.WHOLE_WORDS_ONLY) == SearchType.WHOLE_WORDS_ONLY;

        //Allow the use of regular expressions symbols
        final boolean useRegEx = (searchType & SearchType.USE_REGULAR_EXPRESSIONS) == SearchType.USE_REGULAR_EXPRESSIONS;

        //Results may run over a line break
        final boolean multiLine = (searchType & SearchType.MUTLI_LINE_RESULTS) == SearchType.MUTLI_LINE_RESULTS;

        //Check if coords need swapping
        final boolean valuesSwapped = (mode == PdfData.VERTICAL_BOTTOM_TO_TOP || mode == PdfData.VERTICAL_TOP_TO_BOTTOM);

        //Text to match against, and the same text with coord data kept for locating results
        final String searchText = buildSearchText(false, mode);
        final String coordsText = buildSearchText(true, mode);

        //Work through the search terms one at a time
        for (final String term : terms) {

            //A null or empty term can not match any text
            if (term == null || term.isEmpty()) {
                continue;
            }

            String searchValue = alterStringTooDisplayOrder(term);

            //Separator between the words of a term. These strings are used as the replacement
            //in replaceAll below, where a doubled backslash yields a single one, so multiline
            //ends up as the regular expression [ \n]+ (space or newline)
            String sep = multiLine ? "[ \\\\n]+" : " ";

            //if not using reg ex add reg ex literal flags around the text and word separators
            if (!useRegEx) {
                searchValue = "\\Q" + searchValue + "\\E";
                sep = "\\\\E" + sep + "\\\\Q";
            }

            //If word separator has changed, replace all spaces with modified separator
            if (!sep.equals(" ")) {
                searchValue = searchValue.replaceAll(" ", sep);
            }

            //Surround search term with word boundary tags to match whole words
            if (wholeWordsOnly) {
                searchValue = "\\b" + searchValue + "\\b";
            }

            //Create pattern to match search term
            final Pattern searchTerm = Pattern.compile(searchValue, options);

            //Create pattern to match search term with two words before and after
            final Pattern teaserTerm = Pattern.compile("(?:\\S+\\s)?\\S*(?:\\S+\\s)?\\S*" + searchValue + "\\S*(?:\\s\\S+)?\\S*(?:\\s\\S+)?", options);

            //So long as text data is not null
            if (searchText == null) {
                continue;
            }

            //Create two matchers for finding search term and teaser
            final Matcher termFinder = searchTerm.matcher(searchText);
            final Matcher teaserFinder = teaserTerm.matcher(searchText);

            boolean foundFirst = false;

            //Keep looping till no result is returned
            while (termFinder.find()) {

                //Index of first and last character of the match
                final int termStarts = termFinder.start();
                final int termEnds = termFinder.end() - 1;

                //A zero-length match (possible with regular expressions) covers no
                //characters, so there are no coords to report for it
                if (termEnds < termStarts) {
                    continue;
                }

                //If storing teasers
                if (includeTease) {

                    //Default teaser is the found text itself, in bold if tags are allowed
                    String foundTerm = termFinder.group();
                    if (includeHTMLtags) {
                        foundTerm = "<b>" + foundTerm + "</b>";
                    }

                    findTeaser(foundTerm, teaserFinder, termStarts, termEnds, resultTeasers);
                }

                getResultCoords(coordsText, mode, null, termStarts, termEnds, valuesSwapped, resultCoords);

                //If only finding first occurance, stop searching for this term
                if (firstOccuranceOnly) {
                    foundFirst = true;
                    break;
                }
            }

            //NOTE: once a first occurance has been found no further terms are searched
            //in this writing mode either
            if (foundFirst) {
                break;
            }
        }

        //Remove any trailing empty values
        resultCoords.trim();

        //If including tease values
        if (includeTease) {
            storeTeasers(resultTeasers);
        }

    }


    /**
     * Works out the page coords of one match and appends them to {@code resultCoords}.
     * <p>
     * {@code coordText} is read as a repeating sequence of coord, width and text, each
     * introduced by the marker character. The text lengths are added up to track the
     * position in the searched text, and the coords of the characters at
     * {@code termStarts} and {@code termEnds} give the result area. When a newline is
     * reached between the two, the area so far is stored flagged as continuing and a new
     * area is started on the next line.
     *
     * @param coordText     search text with the coord data still in place
     * @param mode          writing mode being searched
     * @param resultStart   ignored on entry, used as working storage for the start coords
     * @param termStarts    index in the searched text of the first character of the match
     * @param termEnds      index in the searched text of the last character of the match
     * @param valuesSwapped true for vertical writing modes, where x and y values are swapped
     * @param resultCoords  holder the coords are appended to
     */
    private void getResultCoords(final String coordText, final int mode, int[] resultStart, int termStarts, final int termEnds, final boolean valuesSwapped, final Vector_Float resultCoords) {

        final char MARKER2 = PdfGroupingAlgorithms.MARKER2;
        final int coordLength = coordText.length();

        //Track point in text data line (without coord data)
        int pointInLine = -1;

        //Track line on page
        int lineCounter = 0;

        //Skip null values and value not in the correct writing mode to ensure correct result coords
        while (lineCounter < lines.length && (lines[lineCounter].getRawData() == null ||
                Strip.stripXML(lines[lineCounter].getRawData(), true).toString().isEmpty() ||
                mode != lines[lineCounter].getWritingMode())) {
            lineCounter++;
        }

        //No line holds text for this writing mode, so there is nothing to take coords from
        if (lineCounter == lines.length) {
            return;
        }

        //Set once the start of the (current part of the) result has been located
        boolean startFound = false;

        //Cycle through coord text looking for coords of this result
        //Ignore first value as it is known to be the first marker
        for (int pointer = 1; pointer < coordLength; pointer++) {

            //Read up to the next marker to get the coord
            int startPointer = pointer;
            while (pointer < coordLength && coordText.charAt(pointer) != MARKER2) {
                pointer++;
            }
            final float currentX = Float.parseFloat(coordText.substring(startPointer, pointer));
            pointer++;

            //Read up to the next marker to get the character width
            startPointer = pointer;
            while (pointer < coordLength && coordText.charAt(pointer) != MARKER2) {
                pointer++;
            }
            final float width = Float.parseFloat(coordText.substring(startPointer, pointer));
            pointer++;

            //Read up to the next marker to get the text (character)
            startPointer = pointer;
            while (pointer < coordLength && coordText.charAt(pointer) != MARKER2) {
                pointer++;
            }
            final String text = coordText.substring(startPointer, pointer);
            pointInLine += text.length();

            final boolean endsLine = text.contains("\n");

            //Start of term not found yet.
            //Point in line is equal to or greater than start of the term.
            //Store coords and mark start as found.
            if (!startFound && pointInLine >= termStarts) {
                final int startY = (int) (valuesSwapped ? lines[lineCounter].getX2() : lines[lineCounter].getY1());
                resultStart = new int[]{(int) currentX, startY};
                startFound = true;
            }

            //Point in line is equal to or greater than end of the term.
            //Only valid once the start is known, otherwise there are no start coords to pair
            //the end with. Nothing after the end affects the result, so stop here.
            if (startFound && pointInLine >= termEnds) {
                final int endY = (int) (valuesSwapped ? lines[lineCounter].getX1() : lines[lineCounter].getY2());
                storeResultsCoords(valuesSwapped, mode, resultCoords, resultStart[0], resultStart[1], (currentX + width), endY, 0.0f);
                return;
            }

            //Start of term found, end not found and new line character found.
            //Store this part flagged as linked to the next and set up the next part.
            if (startFound && endsLine) {

                //Same end value as used for the end of a result: for vertical text the
                //line's x1 takes the place of y2
                final float lineEnd = valuesSwapped ? lines[lineCounter].getX1() : lines[lineCounter].getY2();
                storeResultsCoords(valuesSwapped, mode, resultCoords, resultStart[0], resultStart[1], (currentX + width), lineEnd, MULTIPLE_AREA_RESULT);

                //Set start of term as not found
                startFound = false;

                //Set this point in line as start of next term
                //Guarantees next character is found as
                //start of the next part of the search term
                termStarts = pointInLine;
            }

            //Progress the line number when we find a \n
            //This is to allow the correct calculation of y coords
            if (endsLine) {
                lineCounter++;

                //If current content pointed at is null or not the correct writing mode, skip value until data is found
                while (lineCounter < lines.length && (lines[lineCounter].getRawData() == null ||
                        Strip.stripXML(lines[lineCounter].getRawData(), true).toString().isEmpty() ||
                        mode != lines[lineCounter].getWritingMode())) {
                    lineCounter++;
                }
            }

        }
    }

    /**
     * Discards every teaser stored so far, so that getTeasers() only returns teasers
     * stored after this call.
     */
    protected void clearStoredTeasers() {
        multipleTermTeasers.clear();
    }

    /**
     * Moves the teasers found for one writing mode into the stored teaser list and
     * empties the supplied holder.
     *
     * @param resultTeasers holder of the teasers to store, cleared on return
     */
    private void storeTeasers(final Vector_String resultTeasers) {

        //Drop any trailing empty values before copying
        resultTeasers.trim();

        for (final String teaser : resultTeasers.get()) {
            multipleTermTeasers.add(teaser);
        }

        //Holder is shared between writing modes, so empty it
        //to stop the same teasers being stored again
        resultTeasers.clear();
    }


    /**
     * Appends one result area to {@code resultCoords} as five values: four coords followed
     * by {@code connected}. For vertical writing modes the x and y values are swapped, with
     * the order of the x values depending on the direction.
     *
     * @param connected value stored after the coords to mark whether the next result is linked
     */
    private static void storeResultsCoords(final boolean valuesSwapped, final int mode, final Vector_Float resultCoords, final float x1, final float y1, final float x2, final float y2, final float connected) {

        final float first;
        final float second;
        final float third;
        final float fourth;

        if (!valuesSwapped) {
            first = x1;
            second = y1;
            third = x2;
            fourth = y2;
        } else if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
            first = y2;
            second = x2;
            third = y1;
            fourth = x1;
        } else {
            first = y2;
            second = x1;
            third = y1;
            fourth = x2;
        }

        resultCoords.addElement(first);
        resultCoords.addElement(second);
        resultCoords.addElement(third);
        resultCoords.addElement(fourth);
        resultCoords.addElement(connected); //Mark next result as linked
    }


    /**
     * Stores a teaser for one result.
     * <p>
     * The next match of {@code teaserFinder} is used as the teaser if it starts before the
     * found term and ends after it; otherwise the supplied default is stored. When HTML
     * tags are allowed the found term inside the teaser is wrapped in bold tags.
     *
     * @param teaser        default teaser, stored if no surrounding text is found
     * @param teaserFinder  matcher for the term together with its surrounding words
     * @param termStarts    index in the searched text of the first character of the term
     * @param termEnds      index in the searched text of the last character of the term
     * @param resultTeasers holder the teaser is appended to
     */
    private void findTeaser(String teaser, final Matcher teaserFinder, final int termStarts, final int termEnds, final Vector_String resultTeasers) {

        if (teaserFinder.find() && teaserFinder.start() < termStarts && teaserFinder.end() > termEnds) {

            final int teaserOffset = teaserFinder.start();

            //replace default with found teaser
            teaser = teaserFinder.group();

            if (includeHTMLtags) {
                //Position of the term relative to the start of the teaser
                final int teaseStarts = termStarts - teaserOffset;
                final int teaseEnds = (termEnds - teaserOffset) + 1;

                //Add bold tags
                teaser = teaser.substring(0, teaseStarts) + "<b>"
                        + teaser.substring(teaseStarts, teaseEnds) + "</b>"
                        + teaser.substring(teaseEnds);
            }

            //Next teaser is looked for after the end of this term
            teaserFinder.region(termEnds + 1, teaserFinder.regionEnd());
        }

        //Store teaser
        resultTeasers.addElement(teaser);
    }


    /**
     * Rearranges a search term so that runs of right to left characters are reversed,
     * while runs of left to right characters keep their order.
     * <p>
     * Characters that are neither left to right nor right to left (spaces, digits,
     * punctuation) are treated as belonging to the run they follow.
     *
     * @param testTerm the term as supplied, must not be null
     * @return the rearranged term, or the term itself if it is empty
     */
    private static String alterStringTooDisplayOrder(final String testTerm) {

        //Nothing to reorder, and there is no first character to take a direction from
        if (testTerm.isEmpty()) {
            return testTerm;
        }

        final StringBuilder searchValue = new StringBuilder(testTerm.length());
        final StringBuilder currentBlock = new StringBuilder();

        byte lastDirection = Character.getDirectionality(testTerm.charAt(0));

        for (int i = 0; i != testTerm.length(); i++) {

            final char current = testTerm.charAt(i);

            //Only track changes between left to right and right to left
            final byte dir;
            switch (Character.getDirectionality(current)) {
                case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
                case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
                case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
                case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
                    dir = Character.DIRECTIONALITY_RIGHT_TO_LEFT;
                    break;
                case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
                case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
                case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
                    dir = Character.DIRECTIONALITY_LEFT_TO_RIGHT;
                    break;
                default:
                    dir = lastDirection;
                    break;
            }

            //Save and reset block if direction changed
            if (dir != lastDirection) {
                searchValue.append(currentBlock);
                currentBlock.setLength(0);
                lastDirection = dir;
            }

            //Right to left characters are added to the front of the block to reverse them
            if (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT) {
                currentBlock.insert(0, current);
            } else {
                currentBlock.append(current);
            }
        }

        return searchValue.append(currentBlock).toString();
    }


    /**
     * Builds the text of one writing mode as a single String, with a newline after each
     * line so results split across lines can be detected.
     *
     * @param includeCoords true to keep the coord data in the text, false to remove the
     *                      hidden markers and leave only the text
     * @param mode          writing mode whose lines are included
     * @return the merged text with duplicate spaces and xml removed
     */
    private String buildSearchText(final boolean includeCoords, final int mode) {

        //Merge all lines of this writing mode, one \n after each
        final StringBuilder merged = new StringBuilder();
        for (final Line line : lines) {
            final String raw = line.getRawData();
            if (raw != null && mode == line.getWritingMode()) {
                merged.append(raw).append('\n');
            }
        }

        //Remove double spaces, replacing them with single spaces
        final String spaced = removeDuplicateSpaces(merged.toString());

        //Coord data is only kept when asked for
        final String content = includeCoords ? spaced : removeHiddenMarkers(spaced);

        //Strip xml from content
        return Strip.stripXML(content, true).toString();
    }

    /**
     * Collapses every run of two or more spaces into a single space.
     * <p>
     * A single replace of "  " with " " would leave two spaces behind for a run of three,
     * so the text is rebuilt keeping only the first space of each run.
     *
     * @param textValue text to clean up, must not be null
     * @return the text with no consecutive spaces, or the same String if it had none
     */
    private static String removeDuplicateSpaces(String textValue) {

        //Most text has no double spaces, so avoid rebuilding it
        if (!textValue.contains("  ")) {
            return textValue;
        }

        final StringBuilder collapsed = new StringBuilder(textValue.length());
        boolean lastWasSpace = false;

        for (int i = 0; i < textValue.length(); i++) {
            final char current = textValue.charAt(i);
            final boolean isSpace = current == ' ';

            //Keep everything except a space that directly follows another space
            if (!(isSpace && lastWasSpace)) {
                collapsed.append(current);
            }
            lastWasSpace = isSpace;
        }

        return collapsed.toString();
    }

    /**
     * Converts the search type flags into the flags used by the regular expressions engine.
     *
     * @param searchType bit flags controlling the search (see class SearchType)
     * @return Pattern flags: CASE_INSENSITIVE unless the search is case sensitive, plus
     * MULTILINE and DOTALL when multi line results are requested
     */
    private static int loadSearcherOptions(final int searchType) {

        final boolean caseSensitive = (searchType & SearchType.CASE_SENSITIVE) == SearchType.CASE_SENSITIVE;
        final boolean multiLine = (searchType & SearchType.MUTLI_LINE_RESULTS) == SearchType.MUTLI_LINE_RESULTS;

        //Matching ignores case unless case sensitive mode was asked for
        int options = caseSensitive ? 0 : Pattern.CASE_INSENSITIVE;

        //Allow search to find split line results
        if (multiLine) {
            options |= Pattern.MULTILINE | Pattern.DOTALL;
        }

        return options;
    }

    /**
     * Orders the four writing modes by how many items use them, most used first.
     * Where modes have the same count, the mode with the higher index comes first.
     *
     * @param unsorted item count for each writing mode, indexed by writing mode
     * @return the writing mode indexes in search order
     */
    private static int[] getWritingModeOrder(final int[] unsorted) {

        final int[] ascending = {unsorted[0], unsorted[1], unsorted[2], unsorted[3]};
        Arrays.sort(ascending);

        //-1 marks a slot that has not been given a writing mode yet
        final int[] order = new int[4];
        Arrays.fill(order, -1);

        for (int modeIndex = 0; modeIndex != unsorted.length; modeIndex++) {

            //Slot 0 pairs with the largest count, so slots are tried from the last
            //(smallest count) to the first and the first free slot with this count is taken
            for (int slot = ascending.length - 1; slot >= 0; slot--) {
                final int countForSlot = ascending[(ascending.length - 1) - slot];

                if (unsorted[modeIndex] == countForSlot && order[slot] == -1) {
                    order[slot] = modeIndex;
                    break;
                }
            }
        }

        return order;
    }

    /**
     * Counts how many of the supplied items use each writing mode. Items whose writing
     * mode is not one of the values 0 to 3 are not counted.
     *
     * @param items items to examine
     * @return the number of items for writing modes 0, 1, 2 and 3, in that order
     */
    private int[] getWritingModeCounts(final Line[] items) {

        final int[] counts = new int[4];

        for (final Line item : items) {
            final int writingMode = item.getWritingMode();

            //Anything outside the four known modes is ignored
            if (writingMode >= 0 && writingMode < counts.length) {
                counts[writingMode]++;
            }
        }

        return counts;
    }

    /**
     * Flags shadow text (a second copy of the same text printed over the first)
     * as merged, and merges "drowned" fragments (one fragment lying wholly
     * inside another) into the earlier fragment of the pair.
     *
     * @param avoidSpaces when true, a drowned pair is only merged if the separator
     *                    chosen for it contains no space character
     */
    private void cleanupShadowsAndDrownedObjects(final boolean avoidSpaces) {

        final int total = fragments.length;

        for (int outer = 0; outer < total; outer++) {

            final Line masterLine = fragments[outer];

            //centre point of the master text
            float centreX = (masterLine.getX1() + masterLine.getX2()) / 2;
            float centreY = (masterLine.getY1() + masterLine.getY2()) / 2;

            for (int inner = outer + 1; inner < total; inner++) {

                final Line childLine = fragments[inner];

                //skip zero-width children and any pair where one side is already used
                if (childLine.getX1() == childLine.getX2() || childLine.hasMerged() || masterLine.hasMerged()) {
                    continue;
                }

                final boolean sameFontSize = childLine.getFontSize() == masterLine.getFontSize();

                final float widthDelta = Math.abs(
                        (childLine.getX2() - childLine.getX1()) - (masterLine.getX2() - masterLine.getX1()));

                final boolean centreInsideChild =
                        centreX > childLine.getX1() && centreX < childLine.getX2()
                                && centreY < childLine.getY1() && centreY > childLine.getY2();

                //same size text of similar width sitting over the master's centre is a shadow
                if (sameFontSize && centreInsideChild && widthDelta < 10) {
                    childLine.setMerged(true);
                    continue;
                }

                //pick up drowned text items (item inside another)
                final boolean childInsideMaster =
                        childLine.getX1() > masterLine.getX1() && childLine.getX2() < masterLine.getX2()
                                && childLine.getY1() < masterLine.getY1() && childLine.getY2() > masterLine.getY2();
                final boolean masterInsideChild =
                        masterLine.getX1() > childLine.getX1() && masterLine.getX2() < childLine.getX2()
                                && masterLine.getY1() < childLine.getY1() && masterLine.getY2() > childLine.getY2();

                if (!childInsideMaster && !masterInsideChild) {
                    continue;
                }

                //the text with the larger y2 is passed first when choosing the separator
                final String separator;
                if (masterLine.getY2() > childLine.getY2()) {
                    separator = getLineDownSeparator(masterLine.getRawData(), childLine.getRawData());
                } else {
                    separator = getLineDownSeparator(childLine.getRawData(), masterLine.getRawData());
                }

                if (!avoidSpaces || separator.indexOf(' ') == -1) {
                    merge(masterLine, childLine, separator);
                }

                //master may have grown, so refresh its centre
                centreX = (masterLine.getX1() + masterLine.getX2()) / 2;
                centreY = (masterLine.getY1() + masterLine.getY2()) / 2;
            }
        }
    }

    /**
     * Work out whether two lines should be joined with a space, a line break
     * or nothing at all.
     *
     * @param rawLine1 raw text of the first line
     * @param rawLine2 raw text of the line that follows it
     * @return a single space by default; a line break when the first line ends in a full
     * stop (last or second to last character) and the second line starts with an upper case
     * letter or '&amp;' in its first two characters
     */
    private static String getLineDownSeparator(final String rawLine1, final String rawLine2) {

        //trimmed copies so the characters at each end can be inspected
        StringBuilder first = new StringBuilder(rawLine1);
        StringBuilder second = new StringBuilder(rawLine2);

        first = Strip.trim(first);
        second = Strip.trim(second);

        final int firstLen = first.length();
        final int secondLen = second.length();

        //need at least two characters on each side to run the tests
        if (firstLen <= 1 || secondLen <= 1) {
            return " ";
        }

        final char lastOfFirst = first.charAt(firstLen - 1);
        final char penultimateOfFirst = first.charAt(firstLen - 2);
        final char leadOfSecond = second.charAt(0);
        final char nextOfSecond = second.charAt(1);

        //hyphenation handling - the set of hyphen characters is empty, so this
        //branch cannot match any character at present
        final String hyphenChars = "";
        if (hyphenChars.indexOf(lastOfFirst) != -1) {
            if (lastOfFirst == ' ') {
                return " ";
            }
            if (penultimateOfFirst == ':') {
                return "\n";
            }
            return "";
        }

        //paragraph break if full stop and next line starts with Capital Letter or &
        final boolean endsWithFullStop = penultimateOfFirst == '.' || lastOfFirst == '.';
        final boolean startsNewSentence = Character.isUpperCase(leadOfSecond)
                || leadOfSecond == '&'
                || Character.isUpperCase(nextOfSecond)
                || nextOfSecond == '&';

        if (endsWithFullStop && startsNewSentence) {
            return "\n";
        }

        return " ";
    }

    /**
     * Decide whether the gap between two text fragments should become a space.
     *
     * @param c                      index into fragments of the first fragment
     * @param l                      index into fragments of the second fragment
     * @param actualGap              measured gap between the two fragments
     * @param addMultiplespaceXMLTag flag for marking multiple spaces (see note in body)
     * @param writingMode            writing mode of the text being joined
     * @return a single space if the gap is at least roughly one space wide, otherwise an empty string
     */
    private String isGapASpace(final int c, final int l, final float actualGap, final boolean addMultiplespaceXMLTag, final int writingMode) {

        //space width scaled by font size for each fragment; the smaller one is used
        final float widthC = fragments[c].getSpaceWidth() * fragments[c].getFontSize();
        final float widthL = fragments[l].getSpaceWidth() * fragments[l].getFontSize();
        final float referenceWidth = (widthC > widthL) ? widthL : widthC;

        //gap expressed as a number of spaces
        float spaces = actualGap / (referenceWidth / 1000);

        //float -> int conversion rounds down, so lift anything over half a space up to one
        if (spaces > 0.51f && spaces < 1) {
            spaces = 1;
        }

        final int spaceCount = (int) spaces;

        String sep = (spaceCount > 0) ? " " : "";

        //add an XML tag to flag multiple spaces
        //NOTE(review): this branch assigns the same single space as above, so the flag and
        //the writing mode currently make no difference to the result - confirm whether a
        //tag literal has been lost here
        if (spaceCount > 1 && addMultiplespaceXMLTag && writingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) {
            sep = " ";
        }

        return sep;
    }

    /**
     * Convert fragments into lines of text for one writing mode and store the
     * result in {@code lines}.
     * <p>
     * Each unmerged fragment in the requested writing mode is used as a master;
     * every other unmerged fragment of the same writing mode whose centre lies on
     * the same line, which follows the master and sits close enough to it, is
     * merged into the master. Fragments that end up unmerged become the lines.
     *
     * @param mode                   writing mode to build lines for
     * @param breakOnSpace           when true, stop extending a master as soon as the gap to
     *                               the next candidate would be a space
     * @param addMultiplespaceXMLTag passed through to the gap test
     * @param isSearch               has no effect on the outcome (both settings apply the same test)
     * @throws PdfException if mode is not one of the four writing modes
     */
    @SuppressWarnings("unused")
    private void createLinesForSearch(final int mode, final boolean breakOnSpace, final boolean addMultiplespaceXMLTag, final boolean isSearch) throws PdfException {

        String separator;

        final boolean debug = false;

        //work on a copy so the order of the fragments array itself is left alone
        final Line[] localLines = fragments.clone();
        final int total = localLines.length;

        //result size: every fragment not already merged, less one for each merge made below
        int finalCount = total;
        for (int i = 0; i < total; i++) {
            if (localLines[i].hasMerged()) {
                finalCount--;
            }
        }

        //reverse order if text right to left
        final boolean reversed = mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM;
        if (reversed) {
            for (int i = 0; i < total; i++) {
                localLines[i] = fragments[total - i - 1];
            }
        }

        //scan items joining best fit to right of each fragment to build lines.
        for (int master = 0; master < total; master++) {

            int id = -1;

            if (!localLines[master].hasMerged() && localLines[master].getWritingMode() == mode) {

                if (debug) {
                    System.out.println("Look for match with " + removeHiddenMarkers(localLines[master].getRawData()));
                }

                for (int child = 0; child < total && id == -1; child++) {

                    /*
                     * Coordinates altered so x axis positive follows line direction
                     * and y axis negative follows paragraph direction.
                     * Coordinates in the order x1, y1, x2, y2.
                     * Master is re-read every pass because a merge changes its bounds.
                     */
                    final float[] masterCoords = getCoordsForWritingMode(localLines[master], mode);
                    final float[] childCoords = getCoordsForWritingMode(localLines[child], mode);

                    if (!localLines[child].hasMerged() && master != child && localLines[master].getWritingMode() == localLines[child].getWritingMode() && childCoords[0] != childCoords[2]) {
                        if (debug) {
                            System.out.println("Checking " + removeHiddenMarkers(localLines[child].getRawData()));
                        }
                        //Get central points
                        final float mx = masterCoords[0] + ((masterCoords[2] - masterCoords[0]) / 2);
                        final float my = masterCoords[3] + ((masterCoords[1] - masterCoords[3]) / 2);
                        final float cx = childCoords[0] + ((childCoords[2] - childCoords[0]) / 2);
                        final float cy = childCoords[3] + ((childCoords[1] - childCoords[3]) / 2);

                        float smallestHeight = (masterCoords[1] - masterCoords[3]);
                        final float fontDifference = (childCoords[1] - childCoords[3]) - smallestHeight;
                        if (fontDifference < 0) {
                            smallestHeight = (childCoords[1] - childCoords[3]);
                        }

                        //Don't merge if the height difference is at least twice the smaller height
                        if (Math.abs(fontDifference) < smallestHeight * 2) {
                            //Check for the same line by checking the center of
                            //child is within master area
                            if (Math.abs(my - cy) < (smallestHeight * 0.5)) {
                                if (mx < cx) { //Child on right
                                    final float distance = childCoords[0] - masterCoords[2];
                                    if (distance <= smallestHeight / 2) {
                                        id = child;
                                    }
                                }
                            }
                        }
                        //Match has been found
                        if (id != -1) {
                            float possSpace = childCoords[0] - masterCoords[2];
                            if (reversed) {
                                possSpace = -possSpace;
                            }

                            //isGapASpace reads from the fragments array, which is in the
                            //opposite order to localLines when the order was reversed above,
                            //so translate the positions back before calling it
                            final int masterFragment = reversed ? total - 1 - master : master;
                            final int childFragment = reversed ? total - 1 - id : id;

                            //add space if gap between this and last object
                            separator = isGapASpace(masterFragment, childFragment, possSpace, addMultiplespaceXMLTag, mode);

                            //merge if adjoin
                            if (breakOnSpace && separator.startsWith(" ")) {
                                break;
                            }

                            if (debug) {
                                System.out.println("Merge items " + master + " & " + id);
                                System.out.println("c  : " + removeHiddenMarkers(localLines[master].getRawData()));
                                System.out.println("id : " + removeHiddenMarkers(localLines[id].getRawData()));
                                System.out.println("");
                            }

                            //see if on right. child != master and the master's writing mode
                            //have already been checked above, and the test was the same
                            //whether or not isSearch was set, so only the position test remains
                            final boolean childFollowsMaster =
                                    (childCoords[0] > masterCoords[0] && mode != PdfData.VERTICAL_TOP_TO_BOTTOM)
                                            || (childCoords[0] < masterCoords[0] && mode == PdfData.VERTICAL_TOP_TO_BOTTOM);

                            if (childFollowsMaster) {
                                merge(localLines[master], localLines[id], separator);
                                finalCount--;
                            }

                            //keep scanning for further fragments to add to this master
                            id = -1;
                        }
                    }
                }
            }
        }

        //collect whatever has not been merged into something else
        lines = new Line[finalCount];
        int next = 0;
        for (int i = 0; i < total; i++) {
            if (!localLines[i].hasMerged()) {
                lines[next] = localLines[i];
                next++;
            }
        }
    }

    /**
     * Return a line's coordinates rearranged for the given writing mode, so
     * that callers can treat every mode as if it were left to right text.
     *
     * @param line line whose coordinates are wanted
     * @param mode writing mode to arrange the coordinates for
     * @return four values in the order x1, y1, x2, y2
     * @throws PdfException if mode is not one of the four writing modes
     */
    private float[] getCoordsForWritingMode(final Line line, final int mode) throws PdfException {
        switch (mode) {
            case PdfData.HORIZONTAL_LEFT_TO_RIGHT:
                return new float[]{line.getX1(), line.getY1(), line.getX2(), line.getY2()};

            case PdfData.HORIZONTAL_RIGHT_TO_LEFT:
                //x values swapped
                return new float[]{line.getX2(), line.getY1(), line.getX1(), line.getY2()};

            case PdfData.VERTICAL_BOTTOM_TO_TOP:
            case PdfData.VERTICAL_TOP_TO_BOTTOM:
                //both vertical modes use the same arrangement: y runs along the line
                return new float[]{line.getY2(), line.getX2(), line.getY1(), line.getX1()};

            default:
                throw new PdfException("Illegal value " + mode + " for currentWritingMode");
        }
    }

    /**
     * merge 2 text localFragments together and update co-ordinates
     */
    private void merge(final Line master, final Line child, final String separator) {

        //update co-ords
        if (master.getX1() > child.getX1()) {
            master.setX1(child.getX1());
        }
        if (master.getY1() < child.getY1()) {
            master.setY1(child.getY1());
        }
        if (master.getX2() < child.getX2()) {
            master.setX2(child.getX2());
        }
        if (master.getY2() > child.getY2()) {
            master.setY2(child.getY2());
        }

        final String test = Fonts.fe;
        StringBuilder masterString = new StringBuilder(master.getRawData());
        final StringBuilder childString = new StringBuilder(child.getRawData());

        //move  if needed and add separator
        if ((masterString.toString().lastIndexOf(test) != -1)) {
            final String masterLocal = masterString.toString();
            masterString = new StringBuilder(masterLocal.substring(0, masterLocal.lastIndexOf(test)));
            masterString.append(separator);
            masterString.append(masterLocal.substring(masterLocal.lastIndexOf(test)));
        } else {
            masterString.append(separator);
        }

        //Only map out space if text length is longer than 1
        if (child.getTextLength() > 1 && masterString.toString().endsWith(" ")) {
            masterString.deleteCharAt(masterString.lastIndexOf(" "));
        }
        //use font size of second text (ie at end of merged text)
        master.setFontSize(child.getFontSize());

        //Remove excess / redundent xml tags
        if ((childString.indexOf("", masterString.lastIndexOf("") + 7 == masterString.lastIndexOf(">"))) {
            childString.replace(childString.indexOf("") + 1, "");
            masterString.replace(masterString.lastIndexOf(""), masterString.lastIndexOf("") + 8, "");
        }

        if ((childString.indexOf("", masterString.lastIndexOf("") + 6 == masterString.lastIndexOf(">"))) {
            childString.replace(childString.indexOf("") + 1, "");
            masterString.replace(masterString.lastIndexOf(""), masterString.lastIndexOf("") + 7, "");
        }

        masterString = masterString.append(childString);

        //track length of text less all tokens
        master.setTextLength(master.getTextLength() + child.getTextLength());

        //set objects to null to flush and log as used
        child.setRawData(null);
        child.setMerged(true);

        master.setRawData(masterString.toString());

//            //use font size of second text (ie at end of merged text)
//            master.setFontSize(child.getFontSize());
//
//            //add together
//            StringBuilder content = new StringBuilder();
//            content.append(master.getRawData()).append(separator).append(child.getRawData());
//            master.setRawData(content.toString());
//
//            //track length of text less all tokens
//            master.setTextLength(master.getTextLength()+child.getTextLength());
//
//            //set objects to null to flush and log as used
//            child.setRawData(null);
//            child.setMerged(true);
    }

    /**
     * Rebuild the fragments array with one Line for each raw text element held
     * in pdf_data.
     */
    private void copyToArrays() {

        fragments = new Line[pdf_data.getRawTextElementCount()];

        //wrap each raw text element in its own Line
        int index = 0;
        while (index < fragments.length) {
            fragments[index] = new Line(pdf_data, index);
            index++;
        }
    }

    /**
     * Sets whether HTML is included in teasers, i.e. whether a teaser keeps
     * its HTML markup or is returned as plain text.
     *
     * @param value True to include HTML, otherwise false
     */
    protected void setIncludeHTML(final boolean value) {
        this.includeHTMLtags = value;
    }

    /**
     * Switch teaser generation during searching on or off.
     *
     * @param value True to generate teasers, otherwise false
     */
    protected void generateTeasers(final boolean value) {
        this.includeTease = value;
    }

    /**
     * Report the flag that controls teaser generation.
     *
     * @return True if teasers are being generated, otherwise false
     */
    protected boolean isGeneratingTeasers() {
        return this.includeTease;
    }

    private class Line implements Comparable {
        private float x1, y1, x2, y2, character_spacing, spaceWidth;
        private String raw, currentColor;
        private int text_length, mode, fontSize;
        private boolean hasMerged;

        Line(final PdfData pdf_data, final int index) {
            loadData(pdf_data, index);
        }

        private void loadData(final PdfData pdf_data, final int index) {
            //extract values
            character_spacing = pdf_data.f_character_spacing[index];
            x1 = pdf_data.f_x1[index];
            x2 = pdf_data.f_x2[index];
            y1 = pdf_data.f_y1[index];
            y2 = pdf_data.f_y2[index];
            currentColor = pdf_data.colorTag[index];
            text_length = pdf_data.text_length[index];
            mode = pdf_data.f_writingMode[index];
            raw = pdf_data.contents[index];
            fontSize = pdf_data.f_end_font_size[index];
            spaceWidth = pdf_data.space_width[index];
            hasMerged = false;
        }

        protected float getX1() {
            return x1;
        }

        protected float getY1() {
            return y1;
        }

        protected float getX2() {
            return x2;
        }

        protected float getY2() {
            return y2;
        }

        protected float getCharacterSpacing() {
            return character_spacing;
        }

        protected float getSpaceWidth() {
            return spaceWidth;
        }

        protected String getRawData() {
            return raw;
        }

        protected String getColorTag() {
            return currentColor;
        }

        protected int getWritingMode() {
            return mode;
        }

        protected int getTextLength() {
            return text_length;
        }

        protected int getFontSize() {
            return fontSize;
        }

        protected boolean hasMerged() {
            return hasMerged;
        }

        protected void setX1(final float value) {
            x1 = value;
        }

        protected void setY1(final float value) {
            y1 = value;
        }

        protected void setX2(final float value) {
            x2 = value;
        }

        protected void setY2(final float value) {
            y2 = value;
        }

        protected void setFontSize(final int value) {
            fontSize = value;
        }

        protected void setRawData(final String value) {
            raw = value;
        }

        protected void setTextLength(final int value) {
            text_length = value;
        }

        protected void setMerged(final boolean value) {
            hasMerged = value;
        }

        @Override
        public int compareTo(final Line o) {
            switch (mode) {
                case PdfData.HORIZONTAL_LEFT_TO_RIGHT:
                case PdfData.HORIZONTAL_RIGHT_TO_LEFT:
                    return (int) (y1 - o.getY1());
                case PdfData.VERTICAL_TOP_TO_BOTTOM:
                case PdfData.VERTICAL_BOTTOM_TO_TOP:
                    return (int) (x1 - o.getX1());
            }
            return 0;
        }
    }
}