org.jpedal.grouping.PdfTextExtractionUtils Maven / Gradle / Ivy

/*
 * ===========================================
 * Java Pdf Extraction Decoding Access Library
 * ===========================================
 *
 * Project Info:  http://www.idrsolutions.com
 * Help section for developers at http://www.idrsolutions.com/support/
 *
 * (C) Copyright 1997-2017 IDRsolutions and Contributors.
 *
 * This file is part of JPedal/JPDF2HTML5
 *
 @LICENSE@
 *
 * ---------------
 * PdfTextExtractionUtils.java
 * ---------------
 */
package org.jpedal.grouping;

import java.awt.Rectangle;
import java.util.*;

import org.jpedal.color.GenericColorSpace;
import org.jpedal.exception.PdfException;
import org.jpedal.objects.PdfData;
import org.jpedal.utils.Fonts;
import org.jpedal.utils.LogWriter;
import org.jpedal.utils.Sorts;
import org.jpedal.utils.Strip;
import org.jpedal.utils.repositories.*;

/**
 * Applies heuristics to unstructured PDF text to create content
 */
public class PdfTextExtractionUtils {

    //platform line separator, read once from the "line.separator" system property
    private static final String SYSTEM_LINE_SEPARATOR = System.getProperty("line.separator");

    //per-fragment flag: true once the fragment has been merged into another
    //(or discarded as a shadow) and must be ignored from then on
    private boolean[] isUsed;

    //per-fragment bounding box co-ords (x1,y1 is top left)
    private float[] f_x1, f_x2, f_y1, f_y2;

    //per-fragment flag tracking whether a space was removed from the end
    private boolean[] hadSpace;

    //per-fragment colour info
    private String[] f_colorTag;

    //per-fragment writing mode (values copied from PdfData.f_writingMode)
    private int[] writingMode;

    //per-fragment font sizes in pixels
    private int[] fontSize;

    //per-fragment amount of space a space uses in this font/size
    //(isGapASpace scales it by fontSize and divides by 1000)
    private float[] spaceWidth;

    //per-fragment actual text; the slot is set to null once the fragment is merged away
    private StringBuilder[] content;

    //per-fragment raw number of text characters
    private int[] textLength;

    //Hold data from pdf so we can create local version
    private final PdfData pdf_data;

    //flag to show if output for table is CSV or XHTML
    private boolean isXHTML = true;

    //slot to insert next value - used when we split fragments for table code
    private int nextSlot;

    //vertical breaks for table calculation (filled by findVerticalLines)
    private Vector_Int lineBreaks = new Vector_Int();

    //holds details as we scan lines for table
    private Vector_Object lines;

    //lookup table used to sort into correct order for table
    private Vector_Int lineY2;

    //counters for cols and rows and pointer to final object we merge into
    private int max_rows, master;

    //flag to show color info is being extracted (read from pdf_data.isColorExtracted())
    private boolean colorExtracted;

    //used to calculate correct order for table lines
    private int[] line_order;

    //amount we resize arrays holding content with if no space
    private static final int CONTENT_ARRAY_INCREMENT = 100;

    //true if the content is treated as XML during merging; overwritten by the constructor
    private boolean isXMLExtraction = true;

    /**
     * Builds an instance around the raw text data of a PDF.
     *
     * @param pdf_data        PdfData holding the raw text to work on
     * @param isXMLExtraction true if the output should be XML
     */
    protected PdfTextExtractionUtils(final PdfData pdf_data, final boolean isXMLExtraction) {
        this.isXMLExtraction = isXMLExtraction;
        this.pdf_data = pdf_data;

        //remember whether the raw data carries colour information
        this.colorExtracted = pdf_data.isColorExtracted();
    }

    /**
     * Decides what is placed between two lines that are being joined:
     * a space (the default), nothing, or a line break.
     *
     * @param rawLine1        first line, tested on its last two characters
     * @param rawLine2        second line, tested on its first two characters
     * @param isXMLExtraction true if the lines hold XML, which is stripped before testing
     * @return the separator to insert between the two lines
     */
    private static String getLineDownSeparator(final StringBuilder rawLine1, final StringBuilder rawLine2, final boolean isXMLExtraction) {

        //underline handling is switched off but the hook below is kept
        final boolean hasUnderline = false;

        //reduce both lines to plain text so the characters at the join can be examined
        final StringBuilder first = isXMLExtraction ? Strip.stripXML(rawLine1, isXMLExtraction) : Strip.trim(rawLine1);
        final StringBuilder second = isXMLExtraction ? Strip.stripXML(rawLine2, isXMLExtraction) : Strip.trim(rawLine2);

        final int firstLen = first.length();
        final int secondLen = second.length();

        //space unless one of the rules below says otherwise
        String separator = " ";

        //the rules need two characters on each side of the join
        if (firstLen > 1 && secondLen > 1) {

            final char endChar = first.charAt(firstLen - 1);
            final char beforeEndChar = first.charAt(firstLen - 2);
            final char startChar = second.charAt(0);
            final char afterStartChar = second.charAt(1);

            //characters treated as hyphens - ignore unless :- or space-
            final String hyphen_values = "";

            if (hyphen_values.indexOf(endChar) != -1) {
                if (endChar == ' ') {
                    separator = " ";
                } else if (beforeEndChar == ':') {
                    separator = "\n";
                } else {
                    separator = "";
                }
            } else {
                //paragraph break if a full stop is followed by a capital letter or '&'
                final boolean endsSentence = (beforeEndChar == '.') || (endChar == '.');
                final boolean startsSentence = Character.isUpperCase(startChar)
                        || (startChar == '&')
                        || Character.isUpperCase(afterStartChar)
                        || (afterStartChar == '&');

                if (endsSentence && startsSentence) {
                    //both output modes currently produce the same break
                    if (isXMLExtraction) {
                        separator = "\n";
                    } else {
                        separator = "\n";
                    }
                }
            }
        }

        //add an underline if appropriate
        if (hasUnderline) {
            if (isXMLExtraction) {
                separator += "\n";
            } else {
                separator += '\n';
            }
        }

        return separator;
    }

    /**
     * Discards shadow text (the same text printed twice) and merges drowned
     * items (a fragment whose box lies inside another fragment's box).
     *
     * @param avoidSpaces if true, a drowned item is only merged when the
     *                    chosen separator contains no space
     */
    private void cleanupShadowsAndDrownedObjects(final boolean avoidSpaces) {

        final int[] fragments = getUnusedFragments();
        final int total = fragments.length;

        for (int outer = 0; outer < total; outer++) {

            //master item
            final int c = fragments[outer];

            if (isUsed[c]) {
                continue;
            }

            //mid point of the master item
            float midX = (f_x1[c] + f_x2[c]) / 2;
            float midY = (f_y1[c] + f_y2[c]) / 2;

            for (int inner = outer + 1; inner < total; inner++) {

                //item to test against
                final int n = fragments[inner];

                //skip zero width fragments and anything already consumed
                if (f_x1[n] == f_x2[n] || isUsed[n] || isUsed[c]) {
                    continue;
                }

                final boolean sameFontSize = fontSize[n] == fontSize[c];
                final float widthDiff = Math.abs((f_x2[n] - f_x1[n]) - (f_x2[c] - f_x1[c]));

                //shadow: same font size, similar width and the master mid point inside n
                final boolean isShadow = sameFontSize
                        && (midX > f_x1[n]) && (midX < f_x2[n])
                        && (widthDiff < 10)
                        && (midY < f_y1[n]) && (midY > f_y2[n]);

                if (isShadow) {
                    isUsed[n] = true;
                    continue;
                }

                //drowned text: one box strictly inside the other
                final boolean nInsideC =
                        (f_x1[n] > f_x1[c]) && (f_x2[n] < f_x2[c])
                                && (f_y1[n] < f_y1[c]) && (f_y2[n] > f_y2[c]);
                final boolean cInsideN =
                        (f_x1[c] > f_x1[n]) && (f_x2[c] < f_x2[n])
                                && (f_y1[c] < f_y1[n]) && (f_y2[c] > f_y2[n]);

                if (!nInsideC && !cInsideN) {
                    continue;
                }

                //get order right - bottom y2 underneath
                final int upper;
                final int lower;
                if (f_y2[c] > f_y2[n]) {
                    upper = c;
                    lower = n;
                } else {
                    upper = n;
                    lower = c;
                }

                final String separator = getLineDownSeparator(content[upper], content[lower], isXMLExtraction);
                if (!avoidSpaces || separator.indexOf(' ') == -1) {
                    merge(upper, lower, separator, true);
                }

                //recalculate as the master box may have changed
                midX = (f_x1[c] + f_x2[c]) / 2;
                midY = (f_y1[c] + f_y2[c]) / 2;
            }
        }
    }

    /**
     * Works out whether the gap between two text fragments should become a space.
     *
     * @param c                      index of one fragment
     * @param l                      index of the other fragment
     * @param actualGap              measured gap between the fragments
     * @param addMultiplespaceXMLTag true to flag gaps of more than one space
     * @param writingMode            writing mode of the text
     * @return " " if the gap is at least one space wide, otherwise ""
     */
    private String isGapASpace(final int c, final int l, final float actualGap, final boolean addMultiplespaceXMLTag, final int writingMode) {

        //width of a space for each fragment - the smaller of the two is used
        final float widthC = spaceWidth[c] * fontSize[c];
        final float widthL = spaceWidth[l] * fontSize[l];
        final float smallerWidth = (widthC > widthL) ? widthL : widthC;

        //express the gap as a number of spaces
        float spaces = actualGap / (smallerWidth / 1000);

        //the int cast below truncates, so lift anything just over half a space to one
        if (spaces > 0.51f && spaces < 1) {
            spaces = 1;
        }

        final int spaceCount = (int) spaces;

        String sep = (spaceCount > 0) ? " " : "";

        //flag multiple spaces (currently the same single space)
        if (spaceCount > 1 && addMultiplespaceXMLTag && writingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) {
            sep = " ";
        }

        return sep;
    }

    /**
     * merge 2 text fragments together and update co-ordinates
     */
    private void merge(final int m, final int c, final String separator, final boolean moveFont) {

        //update co-ords
        if (f_x1[m] > f_x1[c]) {
            f_x1[m] = f_x1[c];
        }
        if (f_y1[m] < f_y1[c]) {
            f_y1[m] = f_y1[c];
        }
        if (f_x2[m] < f_x2[c]) {
            f_x2[m] = f_x2[c];
        }
        if (f_y2[m] > f_y2[c]) {
            f_y2[m] = f_y2[c];
        }

        if (isXMLExtraction) {
            String test = Fonts.fe;

            //add color tag if needed and changes
            if (colorExtracted) {
                test = Fonts.fe + GenericColorSpace.ce;
            }

            //move  if needed and add separator
            if ((moveFont) && (content[m].toString().lastIndexOf(test) != -1)) {
                final String masterContent = content[m].toString();
                content[m] = new StringBuilder(masterContent.substring(0, masterContent.lastIndexOf(test)));
                content[m].append(separator);
                content[m].append(masterContent.substring(masterContent.lastIndexOf(test)));
            } else {
                content[m].append(separator);
            }

            //Only map out space if text length is longer than 1
            if (textLength[c] > 1 && content[m].toString().endsWith(" ")) {
                content[m].deleteCharAt(content[m].lastIndexOf(" "));
            }
            //use font size of second text (ie at end of merged text)
            fontSize[m] = fontSize[c];

            //Remove excess / redundent xml tags
            if ((content[c].indexOf("", content[m].lastIndexOf("") + 7 == content[m].lastIndexOf(">"))) {
                content[c].replace(content[c].indexOf("") + 1, "");
                content[m].replace(content[m].lastIndexOf(""), content[m].lastIndexOf("") + 8, "");
            }

            if ((content[c].indexOf("", content[m].lastIndexOf("") + 6 == content[m].lastIndexOf(">"))) {
                content[c].replace(content[c].indexOf("") + 1, "");
                content[m].replace(content[m].lastIndexOf(""), content[m].lastIndexOf("") + 7, "");
            }

            content[m] = content[m].append(content[c]);

            //track length of text less all tokens
            textLength[m] += textLength[c];

            //set objects to null to flush and log as used
            isUsed[c] = true;
            content[c] = null;
        } else {

            //use font size of second text (ie at end of merged text)
            fontSize[m] = fontSize[c];

            //add together
            content[m] = content[m].append(separator).append(content[c]);

            //track length of text less all tokens
            textLength[m] += textLength[c];

            //set objects to null to flush and log as used
            isUsed[c] = true;
            content[c] = null;
        }
    }

    /**
     * Strips the hidden markers (position/width data) buried in the content
     * of every fragment that is still unused.
     */
    private void removeEncoding() {

        final int[] unused = getUnusedFragments();

        for (int i = 0; i < unused.length; i++) {

            final int fragment = unused[i];

            //ignore used items
            if (isUsed[fragment]) {
                continue;
            }

            //replace content with a copy free of the widths we hid in the data
            content[fragment] = removeHiddenMarkers(fragment);
        }
    }

    /**
     * Copies the raw data held in pdf_data into freshly allocated local
     * arrays (one slot per raw text element) for quick merging.
     */
    private void copyToArrays() {

        colorExtracted = pdf_data.isColorExtracted();

        final int total = pdf_data.getRawTextElementCount();

        //bookkeeping
        isUsed = new boolean[total];
        content = new StringBuilder[total];
        textLength = new int[total];

        //font and layout details
        fontSize = new int[total];
        writingMode = new int[total];
        spaceWidth = new float[total];
        f_colorTag = new String[total];

        //co-ordinates
        f_x1 = new float[total];
        f_x2 = new float[total];
        f_y1 = new float[total];
        f_y2 = new float[total];

        //copy each raw element across
        for (int item = 0; item < total; item++) {
            content[item] = new StringBuilder(pdf_data.contents[item]);

            fontSize[item] = pdf_data.f_end_font_size[item];
            writingMode[item] = pdf_data.f_writingMode[item];
            f_x1[item] = pdf_data.f_x1[item];
            f_colorTag[item] = pdf_data.colorTag[item];
            f_x2[item] = pdf_data.f_x2[item];
            f_y1[item] = pdf_data.f_y1[item];
            f_y2[item] = pdf_data.f_y2[item];

            spaceWidth[item] = pdf_data.space_width[item];
            textLength[item] = pdf_data.text_length[item];
        }
    }

    /**
     * Collects the indices of all fragments not yet flagged as used.
     *
     * @return indices of the unused fragments in ascending order
     */
    private int[] getUnusedFragments() {

        //first pass: size the result
        int unusedCount = 0;
        for (final boolean used : isUsed) {
            if (!used) {
                unusedCount++;
            }
        }

        //second pass: record the pointers
        final int[] items = new int[unusedCount];
        int next = 0;
        for (int i = 0; i < isUsed.length; i++) {
            if (!isUsed[i]) {
                items[next] = i;
                next++;
            }
        }

        return items;
    }


    /**
     * Strips the hidden position/width values we encoded into the data
     * (could be coded to be faster by not using Tokenizer).
     *
     * @param c index of the fragment to clean
     * @return the fragment's own content if it holds no marker, otherwise a
     *         new builder containing just the text
     */
    private StringBuilder removeHiddenMarkers(final int c) {

        final String marker = PdfData.marker;
        final StringBuilder raw = content[c];

        //nothing to do if there are no markers
        if (raw.indexOf(marker) == -1) {
            return raw;
        }

        //markers are returned as tokens too so each encoded group can be recognised
        final StringTokenizer tokenizer = new StringTokenizer(raw.toString(), marker, true);
        final StringBuilder cleaned = new StringBuilder();

        while (tokenizer.hasMoreTokens()) {

            final String token = tokenizer.nextToken();

            if (!token.equals(marker)) {
                cleaned.append(token);
                continue;
            }

            //skip: point character starts, second marker, width, third marker
            for (int skipped = 0; skipped < 4; skipped++) {
                tokenizer.nextToken();
            }

            //put back chars
            cleaned.append(tokenizer.nextToken());
        }

        return cleaned;
    }

    /**
     * Tries to find vertical lines in the text inside the given area
     * (not as efficient as it could be).
     * <p>
     * The x value encoded in front of each piece of text is counted when the
     * piece is the first in its fragment or follows text containing a space;
     * the midpoint between it and the previous x value is counted as well.
     * Every x whose count is greater than half of the highest count is added
     * to lineBreaks.
     *
     * @param minX               left edge of the area
     * @param minY               bottom edge of the area
     * @param maxX               right edge of the area
     * @param maxY               top edge of the area
     * @param currentWritingMode one of the PdfData writing mode constants
     * @throws PdfException if currentWritingMode is not a known writing mode
     */
    private void findVerticalLines(final float minX, final float minY, final float maxX, final float maxY, final int currentWritingMode) throws PdfException {

        //hold counters on all x values
        final Map<Integer, Integer> xLines = new HashMap<Integer, Integer>();

        final String marker = PdfData.marker;

        //counter on most popular item
        int mostFrequent = 0;
        final int count = pdf_data.getRawTextElementCount();
        float x1, x2, y1, y2;

        for (int i = 0; i < count; i++) {
            float currentX = 0;

            //extract values for data
            final String raw = this.pdf_data.contents[i];

            //set pointers so left to right text
            switch (currentWritingMode) {
                case PdfData.HORIZONTAL_LEFT_TO_RIGHT:
                    x1 = this.f_x1[i];
                    x2 = this.f_x2[i];
                    y1 = this.f_y1[i];
                    y2 = this.f_y2[i];
                    break;
                case PdfData.HORIZONTAL_RIGHT_TO_LEFT:
                    x2 = this.f_x1[i];
                    x1 = this.f_x2[i];
                    y1 = this.f_y1[i];
                    y2 = this.f_y2[i];
                    break;
                case PdfData.VERTICAL_BOTTOM_TO_TOP:
                    x1 = this.f_y1[i];
                    x2 = this.f_y2[i];
                    y1 = this.f_x2[i];
                    y2 = this.f_x1[i];
                    break;
                case PdfData.VERTICAL_TOP_TO_BOTTOM:
                    x1 = this.f_y2[i];
                    x2 = this.f_y1[i];
                    y2 = this.f_x1[i];
                    y1 = this.f_x2[i];
                    break;
                default:
                    throw new PdfException("Illegal value " + currentWritingMode + " for currentWritingMode");
            }

            //only fragments inside the area (with half a unit of tolerance) are processed
            if (!((x1 > minX - .5) && (x2 < maxX + .5) && (y2 > minY - .5) && (y1 < maxY + .5))) {
                continue;
            }

            //run though the string extracting our markers to get x values
            final StringTokenizer tokens = new StringTokenizer(raw, marker, true);
            String lastValue = "";

            while (tokens.hasMoreTokens()) {

                //each encoded group starts with a marker - skip anything else
                if (!tokens.nextToken().equals(marker)) {
                    continue;
                }

                final String start = tokens.nextToken(); //point character starts

                if (!start.isEmpty()) {

                    final float lastX = currentX;
                    currentX = Float.parseFloat(start);

                    //count x at start of the fragment or after a space
                    if (lastValue.isEmpty() || (lastValue.indexOf(' ') != -1)) {

                        mostFrequent = incrementXCount(xLines, (int) currentX, mostFrequent);

                        //also count the middle of the gap to the previous x
                        if (lastX != 0) {
                            final int middle = (int) (lastX + ((currentX - lastX) / 2));
                            mostFrequent = incrementXCount(xLines, middle, mostFrequent);
                        }
                    }
                }

                tokens.nextToken(); //second marker
                tokens.nextToken(); //glyph  width
                tokens.nextToken(); //third marker
                lastValue = tokens.nextToken(); //the text itself
            }
        }

        //now analyse the data - keep every x seen more than half as often as the most popular
        final int minimumNeeded = mostFrequent / 2;

        for (final Map.Entry<Integer, Integer> entry : xLines.entrySet()) {
            if (entry.getValue() > minimumNeeded) {
                lineBreaks.addElement(entry.getKey());
            }
        }
    }

    /**
     * Adds one to the counter held for x (starting it at 1 if absent) and
     * returns the highest count reached so far. A counter that has only just
     * been started does not raise the highest count.
     */
    private static int incrementXCount(final Map<Integer, Integer> xCounts, final int x, final int mostFrequent) {

        final Integer previous = xCounts.get(x);
        if (previous == null) {
            xCounts.put(x, 1);
            return mostFrequent;
        }

        final int updated = previous + 1;
        xCounts.put(x, updated);
        return Math.max(mostFrequent, updated);
    }

    /**
     * Tests whether a fragment should be processed for the given area.
     * Horizontal text qualifies when it is at least partly inside; vertical
     * text has to lie completely inside.
     *
     * @return true if the fragment counts as being within the area
     */
    private boolean isFragmentWithinArea(final Fragment fragment, final float minX, final float minY, final float maxX, final float maxY) {

        final int mode = fragment.getWritingMode();

        if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {

            final float textHeight = fragment.getY1() - fragment.getY2();

            //fully enclosed by the highlight on the y axis
            final boolean enclosedY = fragment.getY2() > minY && fragment.getY1() < maxY;

            //partially enclosed: more than half the text height lies inside the area
            final boolean partlyFromY2 = fragment.getY2() > minY && maxY - fragment.getY2() > textHeight * 0.5;
            final boolean partlyFromY1 = fragment.getY1() < maxY && fragment.getY1() - minY > textHeight * 0.5;

            //(minX + maxX) > 0 rejects the degenerate area found in torture.pdf;
            //note it tests the sum of the x bounds rather than their difference
            return (enclosedY || partlyFromY2 || partlyFromY1)
                    && (minX + maxX) > 0
                    && !(fragment.getX2() < minX)
                    && !(fragment.getX1() > maxX); //text overlaps the area on the x axis
        }

        if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
            return fragment.getX1() > minX && fragment.getX2() < maxX
                    && fragment.getY1() > minY && fragment.getY2() < maxY;
        }

        return false;
    }

    /**
     * Copies every raw text fragment of pdf_data that lies inside the requested
     * area into this object's working arrays, splitting a fragment into several
     * entries wherever a break is detected.
     * <p>
     * Each fragment's raw data is scanned token by token. Text is accumulated
     * until one of the following occurs, at which point the accumulated text is
     * written out as its own entry:
     * <ul>
     * <li>the text crosses a value held in lineBreaks (only tested when findLines
     * is set, the fragment has character spacing and the text ends in a space),</li>
     * <li>the token ends with one of the supplied punctuation characters,</li>
     * <li>the token contains a space and breakOnSpace is set, or the token
     * contains three consecutive spaces.</li>
     * </ul>
     * When the scan finishes, isUsed is reallocated with one entry per filled slot.
     *
     * @param minX minimum x of the area to extract
     * @param minY minimum y of the area to extract
     * @param maxX maximum x of the area to extract
     * @param maxY maximum y of the area to extract
     * @param keepFont passed on to the write helpers; also controls whether font
     * tags are added here when isXMLExtraction is set
     * @param breakOnSpace if true a token containing a space ends the current entry
     * @param findLines if true findVerticalLines is run once (for the first
     * fragment inside the area) and lineBreaks is consulted for breaks
     * @param punctuation characters that end an entry (see checkForPunctuation)
     * @param isWordlist passed through to the write helpers
     * @throws PdfException declared on the signature; no throw is visible in this
     * method, so presumably raised by a helper it calls - TODO confirm
     */
    private void copyToArrays(
            final float minX, final float minY, final float maxX, final float maxY,
            final boolean keepFont, final boolean breakOnSpace, final boolean findLines, final String punctuation, final boolean isWordlist) throws PdfException {

        //compile-time switch for the diagnostic output in this method and its helpers
        final boolean debugSplit = false;

        //character separating the position/width/text fields in the raw data
        final char MARKER2 = PdfGroupingAlgorithms.MARKER2;

        //initialise local arrays allow for extra space
        int count = pdf_data.getRawTextElementCount() + CONTENT_ARRAY_INCREMENT;

        initArrays(count);

        //set once findVerticalLines has been run (it uses the writing mode of the first fragment in the area)
        boolean linesScanned = false;

        //set defaults and calculate dynamic values
        //(count goes back to the real number of raw elements for the loop below)
        count -= CONTENT_ARRAY_INCREMENT;
        //pt/last_pt = position of the current/previous token, min/max = area limits on the
        //axis the text runs along, linePos = position of a pending line break (-1 for none)
        float last_pt, min, max, pt, linePos;
        //width of the most recent token, as text. Declared outside the loop so it is
        //NOT reset between fragments
        String char_width = "";
        //text accumulated for the entry currently being built
        StringBuilder text = new StringBuilder();
        Fragment fragment;
        //work through fragments
        for (int i = 0; i < count; i++) {

            fragment = new Fragment(pdf_data, i);

//            if (debugSplit) {
//                System.out.println("raw data=" + fragment.getRawData());
//                System.out.println("text data=" + PdfTextExtractionUtils.removeHiddenMarkers(fragment.getRawData()));
//            }

            if (isFragmentWithinArea(fragment, minX, minY, maxX, maxY)) {

                //look for possible vertical or horizontal lines in the data
                if (!linesScanned && findLines) {
                    findVerticalLines(minX, minY, maxX, maxY, fragment.getWritingMode());
                    linesScanned = true;
                }

                //initialise pointers and pick the area limits for the axis the text runs along
                if (fragment.getWritingMode() == PdfData.HORIZONTAL_LEFT_TO_RIGHT || fragment.getWritingMode() == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
                    //space = (coords[2] - coords[0]) / text_length;
                    pt = fragment.getX1();
                    last_pt = fragment.getX1();
                    min = minX;
                    max = maxX;
                } else { //vertical text
                    //space = (coords[1] - coords[3]) / text_length;
                    pt = fragment.getY2();
                    last_pt = fragment.getY2();
                    min = minY;
                    max = maxY;
                }

                //NOTE(review): linePos is only ever assigned -1 in this method (here and
                //after writeOnVerticalLineBreak), so the 'break on a vertical line'
                //branch below cannot currently be reached - confirm whether intended
                linePos = -1;

                //work through text, using embedded markers to find in area
                final char[] line = fragment.getRawData().toCharArray();

                final int end = line.length;
                int pointer = 0;

                //value = token text as appended to the output, textValue = same token used
                //for the space/punctuation tests, pt_reached = token position as text
                String value, textValue = "", pt_reached;

                //allow for no tokens and return all text fragment
                if (!fragment.getRawData().contains(PdfData.marker)) {
                    text = new StringBuilder(fragment.getRawData());
                }

                boolean isFirstValue = true, breakPointset = false;

                //work through text, using embedded markers to find in area
                while (pointer < end) {

                    //skip tokens until one lies between min and max (or the data runs out)
                    while (true) {

                        if (line[pointer] != MARKER2) {
                            //plain text with no leading marker - take everything up to the next marker
                            final int startPointer = pointer;
                            while ((pointer < end) && (line[pointer] != MARKER2)) {
                                pointer++;
                            }
                            value = fragment.getRawData().substring(startPointer, pointer);

                        } else { //if (value.equals(MARKER)) { // read the next token and its location and width

                            //find first marker
                            while ((pointer < end) && (line[pointer] != MARKER2)) {
                                pointer++;
                            }

                            pointer++;

                            //read up to the second marker - this field is the position of the token
                            int startPointer = pointer;
                            while ((pointer < end) && (line[pointer] != MARKER2)) {
                                pointer++;
                            }
                            pt_reached = fragment.getRawData().substring(startPointer, pointer);
                            pointer++;

                            //read up to the third marker - this field is the width of the token
                            startPointer = pointer;
                            while ((pointer < end) && (line[pointer] != MARKER2)) {
                                pointer++;
                            }

                            char_width = fragment.getRawData().substring(startPointer, pointer);
                            pointer++;

                            //read up to the next marker - this field is the text itself
                            startPointer = pointer;
                            while ((pointer < end) && (line[pointer] != MARKER2)) {
                                pointer++;
                            }

                            value = fragment.getRawData().substring(startPointer, pointer);

                            textValue = value; //keep value with no spaces

                            if (!pt_reached.isEmpty()) { //set point character starts
                                last_pt = pt;
                                pt = Float.parseFloat(pt_reached);

                                //a previous break skipped over spaces, so restart the fragment at this token
                                if (breakPointset) {
                                    alterCoordsBasedOnWritingMode(fragment, pt);
                                    breakPointset = false;
                                }
                            }

                            //add font start if needed (token is the first one past the lower limit)
                            if (isXMLExtraction && last_pt < min && pt > min && !value.startsWith(Fonts.fb)) {
                                value = Fonts.getActiveFontTag(fragment.getRawData(), "") + value;
                            }

                        }
                        //accept the token if a point 30% of the way across it lies strictly inside min/max
                        if (!char_width.isEmpty()) {
                            final float midPoint = pt + (Float.parseFloat(char_width) * 0.3f);
                            if ((midPoint > min) & (midPoint < max)) {
                                setFragmentCoord(fragment, min, max, pt);

                                break;
                            }
                        }

                        //token rejected - discard it and try the next one
                        value = "";
                        textValue = "";

                        if (pointer >= end) {
                            break;
                        }
                    }

                    //make sure font not sliced off on first value
                    if (isFirstValue) {

                        isFirstValue = false;
                        if ((isXMLExtraction) && (keepFont) && (!value.startsWith(Fonts.fb)) && (!value.startsWith(GenericColorSpace.cb))) { //&&(!text.toString().startsWith(Fonts.fb))))
                            text.append(Fonts.getActiveFontTag(text.toString(), fragment.getRawData()));
                        }
                    }

                    //see if a break occurs (a lineBreaks value lies strictly between the last two positions)
                    boolean is_broken = false;
                    if (findLines && fragment.getCharacterSpacing() > 0 && text.toString().endsWith(" ")) {
                        final int counts = lineBreaks.size();
                        for (int jj = 0; jj < counts; jj++) {
                            final int test_x = lineBreaks.elementAt(jj);
                            if ((last_pt < test_x) & (pt > test_x)) {
                                jj = counts; //forces the loop to finish
                                is_broken = true;
                            }
                        }
                    }

                    final boolean endsWithPunctuation = checkForPunctuation(textValue, punctuation);

                    if (is_broken) { //text crosses a detected line

                        text = writeOutFragment(keepFont, isWordlist, debugSplit, last_pt, pt, char_width, text, fragment, i, end, value);

                    } else if (endsWithPunctuation
                            || (breakOnSpace && (textValue.indexOf(' ') != -1 || value.endsWith(" "))) || textValue.contains("   ")) { //break on double-spaces or larger
                        if (debugSplit) {
                            System.out.println("Break 2 endsWithPunctuation=" + endsWithPunctuation + " textValue=" + textValue + '<' + " value=" + value + '<' + " text=" + text + '<');
                        }

                        //store the text collected so far; pt comes back adjusted for any characters before the first space
                        pt = writeOut(keepFont, isWordlist, debugSplit, pt, char_width, text, fragment, i, value, textValue, endsWithPunctuation);

                        if (!char_width.isEmpty()) { //add in space values to start of next shape
                            //ptr starts as the index of the first space (0 if there is none)
                            int ptr = 0;

                            if (textValue.indexOf(' ') != -1) {
                                ptr = textValue.indexOf(' ');
                            }

                            //for wordlists move past the whole run of spaces
                            if (isWordlist) {
                                final int len = textValue.length();
                                while (ptr < len && textValue.charAt(ptr) == ' ') {
                                    ptr++;
                                }
                            }

                            if (ptr > 0) {
                                pt += ptr * Float.parseFloat(char_width);
                            } else {
                                pt += Float.parseFloat(char_width);
                            }

                            breakPointset = ptr > 0;

                        }

                        //store fact it had a space in case we generate wordlist
                        if ((breakOnSpace) & (nextSlot > 0)) {
                            hadSpace[nextSlot - 1] = true;
                        }

                        //start the next entry with just the active font tag and move the fragment start to pt
                        text = new StringBuilder(Fonts.getActiveFontTag(text.toString(), fragment.getRawData()));
                        alterCoordsBasedOnWritingMode(fragment, pt);

                    } else if ((linePos != -1) & (pt > linePos)) { //break on a vertical line

                        text = writeOnVerticalLineBreak(keepFont, isWordlist, linePos, text, fragment, i, value);

                        linePos = -1;

                    } else { //allow for space used as tab
                        if ((isXMLExtraction) && (value.endsWith(' ' + Fonts.fe))) {
                            value = Fonts.fe;
                            textValue = "";

                            alterCoordsBasedOnWritingMode(fragment, last_pt);
                        }
                        text.append(value);
                    }

                }

                //trap scenario we found if all goes through with no break at end
                if (keepFont && isXMLExtraction && !text.toString().endsWith(Fonts.fe)
                        && !text.toString().endsWith(GenericColorSpace.ce)) {
                    text.append(Fonts.fe);
                }

                //create new line with what is left and output
                completeLine(keepFont, isWordlist, text, fragment, i);

                text = new StringBuilder();

            }
        }

        //local lists for faster access (one flag per slot actually filled)
        isUsed = new boolean[nextSlot];

    }

    /**
     * Stores the text still held for a fragment once all of its tokens have been read.
     * Horizontal text is stored with the smaller of its two x values first. Vertical text
     * is stored only when y1 is greater than y2. Any other writing mode stores nothing.
     */
    private void completeLine(final boolean keepFont, final boolean isWordlist, final StringBuilder text, final Fragment fragment, final int i) {
        final int mode = fragment.getWritingMode();

        if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
            //swap the x values if they arrive the wrong way round
            final boolean inOrder = fragment.getX1() < fragment.getX2();
            final float left = inOrder ? fragment.getX1() : fragment.getX2();
            final float right = inOrder ? fragment.getX2() : fragment.getX1();
            addFragment(i, text, left, right, fragment.getY1(), fragment.getY2(), keepFont, fragment, isWordlist);
            return;
        }

        final boolean vertical = mode == PdfData.VERTICAL_BOTTOM_TO_TOP || mode == PdfData.VERTICAL_TOP_TO_BOTTOM;
        if (vertical && fragment.getY1() > fragment.getY2()) {
            addFragment(i, text, fragment.getX1(), fragment.getX2(), fragment.getY1(), fragment.getY2(), keepFont, fragment, isWordlist);
        }
    }

    /**
     * Writes out the text collected so far, using linePos in place of the fragment
     * coordinate on the far side of the break, then starts a new text buffer.
     *
     * @return a new buffer holding the active font tag followed by value
     */
    private StringBuilder writeOnVerticalLineBreak(final boolean keepFont, final boolean isWordlist, final float linePos, StringBuilder text, final Fragment fragment, final int i, final String value) {
        final int mode = fragment.getWritingMode();

        //linePos replaces one coordinate, which one depends on the direction of the text
        if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) {
            addFragment(i, text, fragment.getX1(), linePos, fragment.getY1(), fragment.getY2(), keepFont, fragment, isWordlist);
        } else if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
            addFragment(i, text, linePos, fragment.getX2(), fragment.getY1(), fragment.getY2(), keepFont, fragment, isWordlist);
        } else if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
            addFragment(i, text, fragment.getX1(), fragment.getX2(), linePos, fragment.getY2(), keepFont, fragment, isWordlist);
        } else if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
            addFragment(i, text, fragment.getX1(), fragment.getX2(), fragment.getY1(), linePos, keepFont, fragment, isWordlist);
        }

        //the next entry begins with the font tag that was active, then the current token
        final StringBuilder nextText = new StringBuilder(Fonts.getActiveFontTag(text.toString(), fragment.getRawData()));
        nextText.append(value);

        //remaining part of the fragment now starts at the break
        alterCoordsBasedOnWritingMode(fragment, linePos);

        return nextText;
    }

    /**
     * Writes out the text collected so far as an entry that ends (or, depending on the
     * writing mode, starts) at pt, and returns the possibly adjusted pt.
     * <p>
     * If the token has characters before its first space, pt is first moved on by an
     * estimate of their width. Unless the token ends with punctuation, the trimmed
     * value is appended to text before it is stored.
     *
     * @return pt after any adjustment
     */
    private float writeOut(final boolean keepFont, final boolean isWordlist, final boolean debugSplit, float pt, final String char_width, final StringBuilder text, final Fragment fragment, final int i, final String value, final String textValue, final boolean endsWithPunctuation) {
        //index of the first space. Nothing to adjust unless at least one character precedes it
        //(an earlier extra test on fragment.getX1() was dropped for case 11542)
        final int firstSpace = textValue.indexOf(' ');
        final int tokenLength = textValue.length();

        if (tokenLength > 1 && firstSpace > 0) {
            final float tokenWidth = Float.parseFloat(char_width);

            if (tokenLength == 2) {
                //Issue 22796: one character plus a space. Font data is not available here
                //so the character is estimated at 70% of the token width
                pt += tokenWidth * 0.7;
            } else {
                //share the width equally between the characters
                pt += firstSpace * (tokenWidth / tokenLength);
            }
        }

        if (!endsWithPunctuation) {
            text.append(value.trim());
        }

        final int mode = fragment.getWritingMode();

        if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) {
            if (debugSplit) {
                System.out.println("Add " + fragment.getX1() + ' ' + pt + " text=" + text + " i=" + i);
            }
            addFragment(i, text, fragment.getX1(), pt, fragment.getY1(), fragment.getY2(), keepFont, fragment, isWordlist);

        } else if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
            if (debugSplit) {
                System.out.println("b");
            }
            addFragment(i, text, pt, fragment.getX2(), fragment.getY1(), fragment.getY2(), keepFont, fragment, isWordlist);

        } else if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
            if (debugSplit) {
                System.out.println("c");
            }
            addFragment(i, text, fragment.getX1(), fragment.getX2(), pt, fragment.getY2(), keepFont, fragment, isWordlist);

        } else if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
            if (debugSplit) {
                System.out.println("d");
            }
            addFragment(i, text, fragment.getX1(), fragment.getX2(), fragment.getY1(), pt, keepFont, fragment, isWordlist);
        }

        return pt;
    }

    /**
     * Writes out the text collected so far when the text has crossed a detected line
     * break, then starts a new text buffer and moves the start of the fragment to pt.
     * <p>
     * The entry is stored using a temporary copy of the fragment whose leading
     * coordinate (chosen by writing mode) is set to last_pt plus the token width.
     *
     * @param last_pt position of the previous token
     * @param pt position of the current token; becomes the new start of fragment
     * @param char_width width of the current token as text; must be parseable as a float
     * @param i index of the fragment in pdf_data
     * @param end length of the fragment's raw data; no longer used, kept so the
     * signature is unchanged
     * @param value text of the current token, which starts the new buffer
     * @return a new buffer holding the active font tag followed by value
     */
    private StringBuilder writeOutFragment(final boolean keepFont, final boolean isWordlist, final boolean debugSplit, final float last_pt, final float pt, final String char_width, StringBuilder text, final Fragment fragment, final int i, final int end, final String value) {
        if (debugSplit) {
            System.out.println("Break 1 is_broken");
        }

        //Build the working copy from the SAME element as fragment. This used to pass
        //'end' (a character count) as the element index, which gave the copy the
        //writing mode of an unrelated slot and could index past the pdf_data arrays.
        final Fragment temp = new Fragment(pdf_data, i);
        temp.setX1(fragment.getX1());
        temp.setY1(fragment.getY1());
        temp.setX2(fragment.getX2());
        temp.setY2(fragment.getY2());
        alterCoordsBasedOnWritingMode(temp, last_pt + Float.parseFloat(char_width));

        addFragment(i, text, temp.getX1(), temp.getX2(), temp.getY1(), temp.getY2(), keepFont, fragment, isWordlist);

        //the next entry begins with the font tag that was active, then the current token
        text = new StringBuilder(Fonts.getActiveFontTag(text.toString(), fragment.getRawData()));
        text.append(value);

        alterCoordsBasedOnWritingMode(fragment, pt);
        return text;
    }

    /**
     * Moves the leading coordinate of a fragment to pt when that coordinate currently
     * lies outside the min/max range, so an entry starts at the first token inside
     * the area. Which coordinate is the leading one depends on the writing mode.
     * <p>
     * The tests for the right-to-left and vertical modes used to be nested inside the
     * left-to-right branch, where they could never run. Each mode now has its own case.
     *
     * @param fragment fragment to adjust
     * @param min lower limit of the area on the axis the text runs along
     * @param max upper limit of the area on the axis the text runs along
     * @param pt position of the token that has just been accepted
     */
    private void setFragmentCoord(final Fragment fragment, final float min, final float max, final float pt) {
        switch (fragment.getWritingMode()) {
            case PdfData.HORIZONTAL_LEFT_TO_RIGHT:
                if ((fragment.getX1() < min || fragment.getX1() > max) && pt >= min) {
                    fragment.setX1(pt);
                }
                break;
            case PdfData.HORIZONTAL_RIGHT_TO_LEFT:
                if ((fragment.getX2() > max || fragment.getX2() < min) && pt <= max) {
                    fragment.setX2(pt);
                }
                break;
            case PdfData.VERTICAL_BOTTOM_TO_TOP:
                if ((fragment.getY2() < min || fragment.getY2() > max) && pt >= min) {
                    fragment.setY2(pt);
                }
                break;
            case PdfData.VERTICAL_TOP_TO_BOTTOM:
                //NOTE(review): condition kept exactly as written. By symmetry with the
                //right-to-left case 'pt <= max' may have been intended - confirm
                if ((fragment.getY1() < min || fragment.getY1() > max) && pt <= min) {
                    fragment.setY1(pt);
                }
                break;
            default:
                break;
        }
    }

    /**
     * Sets the leading coordinate of a fragment to value: x1 for left-to-right text,
     * x2 for right-to-left, y2 for bottom-to-top and y1 for top-to-bottom.
     * Any other writing mode leaves the fragment untouched.
     */
    private static void alterCoordsBasedOnWritingMode(final Fragment fragment, final float value) {

        final int mode = fragment.getWritingMode();

        if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) {
            fragment.setX1(value);
        } else if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
            fragment.setX2(value);
        } else if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
            fragment.setY2(value);
        } else if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
            fragment.setY1(value);
        }
    }

    /**
     * Allocates all of the per-fragment working arrays, each with count entries.
     * Anything previously held in them is discarded.
     */
    private void initArrays(final int count) {
        //position of each entry
        f_x1 = new float[count];
        f_x2 = new float[count];
        f_y1 = new float[count];
        f_y2 = new float[count];

        //text and how it is drawn
        content = new StringBuilder[count];
        textLength = new int[count];
        fontSize = new int[count];
        spaceWidth = new float[count];
        writingMode = new int[count];
        f_colorTag = new String[count];

        //status flags
        hadSpace = new boolean[count];
        isUsed = new boolean[count];
    }

    /**
     * Tests whether the last visible character of a token is one of the supplied
     * punctuation characters. Trailing spaces and trailing tags (anything from a
     * closing {@code >} back to its opening {@code <}) are skipped first.
     * <p>
     * A trailing {@code ;} is not counted when it looks like the end of an entity
     * such as {@code &amp;} or {@code &#160;}, that is when only letters and
     * digits lie between it and a preceding {@code &} or {@code #}.
     * <p>
     * A {@code ;} used to be reported as punctuation even when it was not in the
     * punctuation set. It is now treated like any other character.
     *
     * @param textValue token to test, must not be null
     * @param punctuation characters to treat as punctuation, may be null or empty
     * @return true if the token ends with one of the punctuation characters
     */
    private static boolean checkForPunctuation(final String textValue, final String punctuation) {

        if (punctuation == null || punctuation.isEmpty() || textValue.isEmpty()) {
            return false;
        }

        //walk back over any spaces and tags at the end to reach the last visible character
        int pos = textValue.length() - 1;
        char lastChar = textValue.charAt(pos);
        boolean inTag = lastChar == '>';
        while ((inTag || lastChar == ' ') && pos > 0) {

            if (lastChar == '<') {
                inTag = false;
            }

            pos--;
            lastChar = textValue.charAt(pos);

            if (lastChar == '>') {
                inTag = true;
            }
        }

        if (punctuation.indexOf(lastChar) == -1) {
            return false;
        }

        if (lastChar != ';') {
            return true;
        }

        //a ';' closing something like &xxx; or &#nnn; is part of an entity, not punctuation
        for (int i = pos - 1; i >= 0; i--) {
            final char c = textValue.charAt(i);
            if (c == '&' || c == '#') {
                return false;
            }
            if (!Character.isLetterOrDigit(c)) {
                break;
            }
        }
        return true;
    }

    /**
     * Adds a piece of text and its coordinates to the working arrays at nextSlot,
     * growing the arrays by CONTENT_ARRAY_INCREMENT when they are full.
     * Text for which getFirstChar returns -1 (blank) is ignored.
     * <p>
     * Before it is stored the text is cleaned up: for wordlists, ampersand-hash
     * escapes and angle brackets are stripped. Tags are then either stripped
     * (keepFontTokens false) or, for XML extraction, closed with a font end tag
     * and, when colour is extracted, a colour end tag.
     * <p>
     * The arrays are grown with Arrays.copyOf so that isUsed, which can be shorter
     * than the other arrays, is extended safely. The values are stored in one
     * place rather than once per branch.
     *
     * @param index index of the source element in pdf_data (used for font size,
     * writing mode and space width)
     * @param contentss text to store. Tags may be appended to this same buffer
     * @param x1 x1 coordinate to store
     * @param x2 x2 coordinate to store
     * @param y1 y1 coordinate to store
     * @param y2 y2 coordinate to store
     * @param keepFontTokens if false all tags are stripped from the text
     * @param fragment source fragment, supplies the text length and colour tag
     * @param isWordlist if true escapes and angle brackets are stripped
     */
    private void addFragment(
            final int index,
            final StringBuilder contentss,
            final float x1,
            final float x2,
            final float y1,
            final float y2,
            final boolean keepFontTokens, final Fragment fragment, final boolean isWordlist) {

        StringBuilder current_text = contentss;
        final String str = current_text.toString();

        final int text_len = fragment.getTextLength();
        final String currentColorTag = fragment.getColorTag();

        //strip <> or ascii equivalents
        if (isWordlist) {
            if (str.contains("&#")) {
                current_text = Strip.stripAmpHash(current_text);
            }

            //the test is made on the text as it arrived, as before
            if (str.indexOf('<') != -1 || str.indexOf('>') != -1) {
                if (isXMLExtraction) {
                    current_text = Strip.stripXMLArrows(current_text, true);
                } else {
                    current_text = Strip.stripArrows(current_text);
                }
            }
        }

        //ignore blank space objects
        if (getFirstChar(current_text) == -1) {
            return;
        }

        if (!keepFontTokens) {

            //strip fonts if required
            current_text = Strip.stripXML(current_text, isXMLExtraction);

        } else if (isXMLExtraction) {

            //make sure the text is closed by a font end tag and, if colour is
            //being extracted, by a colour end tag after it
            if (pdf_data.isColorExtracted() && (!current_text.toString().endsWith(GenericColorSpace.ce))) {

                if (!current_text.toString().endsWith(Fonts.fe)) {
                    current_text.append(Fonts.fe);
                }
                current_text.append(GenericColorSpace.ce);

            } else if ((!pdf_data.isColorExtracted()) && (!current_text.toString().endsWith(Fonts.fe))) {
                current_text.append(Fonts.fe);
            }
        }

        //make room if every slot is taken
        if (nextSlot >= f_x1.length) {
            final int newSize = f_x1.length + CONTENT_ARRAY_INCREMENT;

            f_x1 = Arrays.copyOf(f_x1, newSize);
            f_x2 = Arrays.copyOf(f_x2, newSize);
            f_y1 = Arrays.copyOf(f_y1, newSize);
            f_y2 = Arrays.copyOf(f_y2, newSize);
            f_colorTag = Arrays.copyOf(f_colorTag, newSize);
            spaceWidth = Arrays.copyOf(spaceWidth, newSize);
            content = Arrays.copyOf(content, newSize);
            fontSize = Arrays.copyOf(fontSize, newSize);
            textLength = Arrays.copyOf(textLength, newSize);
            writingMode = Arrays.copyOf(writingMode, newSize);
            hadSpace = Arrays.copyOf(hadSpace, newSize);
            isUsed = Arrays.copyOf(isUsed, newSize);
        }

        //store in the next free slot
        f_x1[nextSlot] = x1;
        f_colorTag[nextSlot] = currentColorTag;
        f_x2[nextSlot] = x2;
        f_y1[nextSlot] = y1;
        f_y2[nextSlot] = y2;

        fontSize[nextSlot] = pdf_data.f_end_font_size[index];
        writingMode[nextSlot] = pdf_data.f_writingMode[index];
        textLength[nextSlot] = text_len;

        spaceWidth[nextSlot] = pdf_data.space_width[index];
        content[nextSlot] = current_text;

        nextSlot++;
    }

    //////////////////////////////////////////////////////////////////////

    /**
     * Puts the rows of a table together into one object with a start and an end.
     * <p>
     * master is set to the first item of the first row (in line_order). The first
     * item of each later row is then merged into it with merge(...), unless that
     * item has no content. If master itself has no content, the next row's item
     * becomes master instead. For XHTML output, text is finally added before and
     * after the merged content.
     *
     * @param border_width selects which start marker is added for XHTML output
     */
    private void mergeTableRows(final int border_width) {

        //merge row contents
        //NOTE(review): as written the XHTML and non-XHTML separators are the same string,
        //which makes the test below redundant - confirm the literals are intact
        String separator = "\n";

        if (!isXHTML) {
            separator = "\n";
        }

        master = ((Vector_Int) lines.elementAt(line_order[0])).elementAt(0);

        int item;
        for (int rr = 1; rr < max_rows; rr++) {

            item = ((Vector_Int) lines.elementAt(line_order[rr])).elementAt(0);
            if (content[master] == null) {
                //nothing to merge into yet, so this row takes over as master
                master = item;
            } else if (content[item] != null) {
                merge(master, item, separator, false);
            }
        }

        //add start/end marker
        //NOTE(review): both border_width branches currently produce the same text
        //(a newline in front, two newlines after) - confirm the literals are intact
        if (isXHTML) {
            if (border_width == 0) {
                content[master].insert(0, "\n");
                content[master].append("\n\n");
            } else {
                final StringBuilder startTag = new StringBuilder("\n");
                startTag.append(content[master]);
                content[master] = startTag;
                content[master].append("\n\n");
            }
        }

    }

    /**
     * Builds the list of fragments not flagged in isUsed and returns it sorted.
     * Coordinates are truncated to int to form the sort keys.
     *
     * @param sortOnX if true sort on x1 then y1, otherwise sort on a y value then x1
     * @param use_y1 when sorting on y, use y1 if true and y2 if false
     * @return the fragment indices as ordered by Sorts.quicksort
     */
    private int[] getsortedUnusedFragments(
            final boolean sortOnX,
            final boolean use_y1) {

        //first pass - count the fragments still available
        int unusedCount = 0;
        for (final boolean used : isUsed) {
            if (!used) {
                unusedCount++;
            }
        }

        final int[] items = new int[unusedCount];
        final int[] xKeys = new int[unusedCount];
        final int[] y1Keys = new int[unusedCount];
        final int[] y2Keys = new int[unusedCount];

        //second pass - record each unused fragment with its sort keys
        int slot = 0;
        for (int frag = 0; frag < isUsed.length; frag++) {
            if (isUsed[frag]) {
                continue;
            }

            items[slot] = frag;
            xKeys[slot] = (int) f_x1[frag];
            y1Keys[slot] = (int) f_y1[frag];
            y2Keys[slot] = (int) f_y2[frag];
            slot++;
        }

        //choose the primary and secondary keys, then sort
        final int[] primary;
        final int[] secondary;
        if (sortOnX) {
            primary = xKeys;
            secondary = y1Keys;
        } else {
            primary = use_y1 ? y1Keys : y2Keys;
            secondary = xKeys;
        }

        return Sorts.quicksort(primary, secondary, items);
    }

    /**
     * Creates the rows of data from preassembled indices, adding separators. Each
     * row is built in a temporary buffer which then replaces the content of the
     * row's master item - we don't know how many columns there are until the
     * table is built.
     * <p>
     * Depending on isXHTML a row is written either with cell attributes
     * (colspan, width) or as quoted, comma-separated values.
     *
     * @param keep_alignment_information not referenced in the lines visible here;
     * presumably controls an alignment attribute - TODO confirm
     * @param keep_width_information if true a width attribute is written for each cell
     * @param currentWritingMode selects which coordinate arrays act as the start
     * and end of each item along the direction of the text
     * @throws PdfException if currentWritingMode is not one of the four PdfData
     * writing modes
     */
    private void createTableRows(
            final boolean keep_alignment_information,
            final boolean keep_width_information, final int currentWritingMode) throws PdfException {

        //create local copies of arrays
        //(these are references to the instance arrays, not copies of their contents)
        final float[] l_x1;
        final float[] l_x2;

        //set pointers so left to right text
        switch (currentWritingMode) {
            case PdfData.HORIZONTAL_LEFT_TO_RIGHT:
                l_x1 = this.f_x1;
                l_x2 = this.f_x2;
                break;
            case PdfData.HORIZONTAL_RIGHT_TO_LEFT:
                l_x2 = this.f_x1;
                l_x1 = this.f_x2;
                break;
            case PdfData.VERTICAL_BOTTOM_TO_TOP:
                l_x1 = this.f_y2;
                l_x2 = this.f_y1;
                break;
            case PdfData.VERTICAL_TOP_TO_BOTTOM:
                l_x1 = this.f_y1;
                l_x2 = this.f_y2;
                //fiddle x,y co-ords so it works
                //(l_x1/l_x2 are f_y1/f_y2 themselves, so the loop below changes the instance arrays)

                //get max size
                int maxX = 0;
                for (final float aF_x1 : l_x1) {
                    if (maxX < aF_x1) {
                        maxX = (int) aF_x1;
                    }
                }
                maxX++; //allow for fp error
                //turn around
                for (int ii = 0; ii < l_x2.length; ii++) {
                    l_x1[ii] = maxX - l_x1[ii];
                    l_x2[ii] = maxX - l_x2[ii];
                }
                break;
            default:
                throw new PdfException("Illegal value " + currentWritingMode + "for currentWritingMode");
        }

        final Vector_Int[] rowContents = new Vector_Int[max_rows];
        final Vector_String alignments = new Vector_String(); //text alignment
        final Vector_Float widths = new Vector_Float(); //cell widths
        final Vector_Float cell_x1 = new Vector_Float(); //start position of each cell
        String separator = "", empty_cell = " ";

        if (!isXHTML) {
            separator = "\",\"";
            empty_cell = "";
        }

        //fills rowContents, alignments, widths and cell_x1 and returns a count for each row
        final int[] itemCount = loadTableItemData(rowContents, alignments, widths, cell_x1, l_x1, l_x2);

        //now assemble rows
        for (int row = 0; row < max_rows; row++) {
            final StringBuilder line_content = new StringBuilder(100);

            //the last element of each row is not written out as a cell
            int count = rowContents[row].size() - 1;
            master = ((Vector_Int) lines.elementAt(row)).elementAt(0);

            for (int i = 0; i < count; i++) {
                //item is an index into content, or -1 for an empty cell
                final int item = rowContents[row].elementAt(i);

                if (isXHTML) {

                    //get width
                    float current_width = widths.elementAt(i);
                    final String current_alignment = alignments.elementAt(i);
                    int test, colspan = 1, pointer = i + 1;

                    if (item != -1) {

                        //look for colspan (swallow the empty cells that follow this one)
                        while (true) {
                            test = rowContents[row].elementAt(i + 1);
                            if ((test != -1) | (count == i + 1)) {
                                break;
                            }

                            //break if over another col - roll up single value on line
                            if (itemCount[row] > 1 && (cell_x1.elementAt(i + 1) > l_x2[item])) {
                                break;
                            }

                            count--;
                            rowContents[row].removeElementAt(i + 1);
                            colspan++;

                            //update width
                            current_width += widths.elementAt(pointer);
                            pointer++;
                        }
                    }
                    //NOTE(review): the next statement looks truncated in this copy (its string
                    //literal is never closed and the braces that follow have no opening
                    //statement) - restore from version control before relying on this method
                    line_content.append(" 1) {
                            line_content.append(" colspan='").append(colspan).append('\'');
                        }
                    }

                    if (keep_width_information) {
                        line_content.append(" width='").append((int) current_width).append('\'');
                    }

                    line_content.append(" nowrap>");
                    if (item == -1) {
                        line_content.append(empty_cell);
                    } else {
                        line_content.append(content[item]);
                    }
                    //NOTE(review): appends an empty string as written - confirm the literal is intact
                    line_content.append("");

                } else { //csv
                    if (item == -1) { //empty col
                        line_content.append("\"\",");
                    } else { //value
                        line_content.append('\"');
                        line_content.append(content[item]);
                        line_content.append("\",");
                    }
                }

                //merge to update other values
                if ((item != -1) && (master != item)) { //merge tracks the shape
                    merge(master, item, separator, false);
                }

            }
            //substitute our 'hand coded' value
            content[master] = line_content;

        }
    }

    /**
     * Works through the rows held in {@code lines} one column at a time. For each column
     * it derives the left edge, width and alignment from the next unplaced item on every
     * row, then records for every row either the item placed in that column or -1 for an
     * empty cell.
     *
     * @param rowContents output array with one slot per row; every slot is replaced with a new
     *                    Vector_Int which receives the item ids (or -1) for each column
     * @param alignments  output; receives "left", "right" or "center" for each column
     * @param widths      output; receives the computed width of each column
     * @param cell_x1     output; receives the left edge of each column
     * @param l_x1        start x of every item, indexed by item id
     * @param l_x2        end x of every item, indexed by item id
     * @return for every row, the size of its entry in {@code lines} minus one
     */
    private int[] loadTableItemData(final Vector_Int[] rowContents, final Vector_String alignments, final Vector_Float widths, final Vector_Float cell_x1, final float[] l_x1, final float[] l_x2) {

        int itemsInTable = 0, items_added = 0, item, i;

        //pointer to current element on each row
        final int[] currentItem = new int[max_rows];

        //set number of items on each line, column count and populate empty rows
        final int[] itemCount = new int[max_rows];
        for (i = 0; i < max_rows; i++) {
            itemCount[i] = ((Vector_Int) lines.elementAt(i)).size() - 1;

            //total number of items
            itemsInTable += itemCount[i];

            //reset other values
            currentItem[i] = 0;
            rowContents[i] = new Vector_Int(20);
        }

        //now work through and split any overlapping items until all done
        //(each pass of this loop produces one column)
        while (true) {

            //size of column and pointers (9999 is used as the 'not yet set' value)
            float x1 = 9999, min_x2 = 9999, x2, current_x1, current_x2, c_x1, next_x1 = 9999, c_x2, items_in_column = 0;

            boolean all_done = true; //flag to exit at end
            float total_x1 = 0;
            float total_x2 = 0;
            float left_gap = 0;
            final float right_gap;

            //default when the column is neither clearly left nor right aligned
            String alignment = "center";

            if (items_added < itemsInTable) {

                //work out cell x boundaries on basis of objects
                for (i = 0; i < max_rows; i++) { //get width for column
                    if (itemCount[i] > currentItem[i]) { //row still has an unplaced item

                        item = ((Vector_Int) lines.elementAt(i)).elementAt(currentItem[i]);
                        current_x1 = l_x1[item];
                        current_x2 = l_x2[item];

                        if (current_x1 < x1) { //left margin
                            x1 = current_x1;
                        }
                        if (current_x2 < min_x2) { //right margin if appropriate
                            min_x2 = current_x2;
                        }

                    }
                }

                //NOTE: the left edge is stored before the x1 == x2 exit below, whereas
                //width and alignment are only stored after it
                cell_x1.addElement(x1); //save left margin
                x2 = min_x2; //set default right margin

                //workout end and next column start by scanning all items
                for (i = 0; i < max_rows; i++) { //slot the next item on each row together work out item
                    //NOTE(review): unlike the loop above this reads the current slot without first
                    //checking itemCount[i] > currentItem[i] - confirm Vector_Int tolerates that
                    item = ((Vector_Int) lines.elementAt(i)).elementAt(currentItem[i]);
                    c_x1 = l_x1[item];
                    c_x2 = l_x2[item];

                    //max item width of this column
                    if ((c_x1 >= x1) & (c_x1 < min_x2) & (c_x2 > x2)) {
                        x2 = c_x2;
                    }

                    if (currentItem[i] < itemCount[i]) { //next left margin

                        item = ((Vector_Int) lines.elementAt(i)).elementAt(currentItem[i] + 1);
                        current_x1 = l_x1[item];
                        if ((current_x1 > min_x2) & (current_x1 < next_x1)) {
                            next_x1 = current_x1;
                        }
                    }
                }

                //stop infinite loop case (zero-width column)
                if (x1 == x2) {
                    break;
                }

                //allow for last column (no following item found so next_x1 is still unset)
                if (next_x1 == 9999) {
                    next_x1 = x2;
                }

                //Count items in table, workout raw totals for alignment and widest x2 in column.
                for (i = 0; i < max_rows; i++) { //slot the next item on each row together

                    //work out item
                    item = ((Vector_Int) lines.elementAt(i)).elementAt(currentItem[i]);
                    c_x1 = l_x1[item];
                    c_x2 = l_x2[item];

                    //use items in first column of single colspan
                    if ((c_x1 >= x1) & (c_x1 < min_x2) & (c_x2 <= next_x1)) {

                        //running totals to calculate alignment
                        total_x1 += c_x1;
                        total_x2 += c_x2;
                        items_in_column++;

                    }
                }

                //work out gap and include empty space between cols and save
                //NOTE(review): i equals max_rows after the loop above, so this test can only
                //succeed when max_rows is 0 - possibly meant to detect the first column
                if (i == 0) {
                    left_gap = x1;
                }
                //NOTE(review): next_x1 is never assigned -1 in this method, so the first
                //branch looks unreachable
                if (next_x1 == -1) {
                    right_gap = 0;
                } else {
                    //half of the space up to the next column, truncated to a whole number
                    right_gap = (int) ((next_x1 - x2) / 2);
                }

                final int width = (int) (x2 - x1 + right_gap + left_gap);

                widths.addElement(width);

                //workout the alignment by comparing the average item edges with the column edges
                //(items_in_column is a float, so an empty column gives NaN, fails both tests
                //below and keeps the "center" default)
                final float x1_diff = (total_x1 / items_in_column) - x1;
                final float x2_diff = x2 - (total_x2 / items_in_column);
                if (x1_diff < 1) {
                    alignment = "left";
                } else if (x2_diff < 1) {
                    alignment = "right";
                }
                alignments.addElement(alignment);

                for (i = 0; i < max_rows; i++) { //slot the next item on each row together
                    //side effect: the master field is left pointing at the first item of the last row visited
                    master = ((Vector_Int) lines.elementAt(i)).elementAt(0);
                    //get next item on line or -1 for no more
                    if (itemCount[i] > currentItem[i]) {
                        //work out item
                        item = ((Vector_Int) lines.elementAt(i)).elementAt(currentItem[i]);
                        c_x1 = l_x1[item];
                        all_done = false;

                    } else {
                        item = -1;
                        c_x1 = -1;
                    }

                    if ((item == -1) & (items_added <= itemsInTable)) {
                        //all items in table so just filling in gaps
                        rowContents[i].addElement(-1);

                    } else if ((c_x1 >= x1) & (c_x1 < x2)) {
                        //fits into cell so add in and roll on marker

                        rowContents[i].addElement(item);
                        currentItem[i]++;

                        items_added++;
                    } else if (c_x1 > x2) { //empty cell
                        rowContents[i].addElement(-1);
                    }
                }
            }
            //all_done is still true when every item is already placed or no row had an item left
            if (all_done) {
                break;
            }
        }
        return itemCount;
    }

    /**
     * Work through the fragments and group them into rows. Nothing is returned: every row
     * found is appended to {@code lines} as a Vector_Int of fragment ids, the row's y2 value
     * is appended to {@code lineY2} and {@code max_rows} is incremented.
     *
     * @param itemCount      number of entries in items to examine
     * @param items          fragment ids to group (reversed for right to left text, and replaced
     *                       by a freshly sorted list for top to bottom text)
     * @param addSpaceXMLTag passed through to isGapASpace when two fragments are joined
     * @param mode           writing mode (one of the PdfData direction constants); only fragments
     *                       in this mode start a row
     * @throws PdfException if mode is not one of the four writing modes handled below
     */
    private void createLinesInTable(final int itemCount, int[] items, final boolean addSpaceXMLTag, final int mode) throws PdfException {

        //reverse order if text right to left
        if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
            items = reverse(items);
        }

        //create and populate local copies of arrays
        //(these are references to the field arrays, not copies of the data)
        final float[] l_x1;
        final float[] l_x2;
        final float[] l_y1;
        final float[] l_y2;

        // set pointers so always left to right text
        switch (mode) {
            case PdfData.HORIZONTAL_LEFT_TO_RIGHT:
                l_x1 = this.f_x1;
                l_x2 = this.f_x2;
                l_y1 = this.f_y1;
                l_y2 = this.f_y2;
                break;

            case PdfData.HORIZONTAL_RIGHT_TO_LEFT:
                //x start and end swapped
                l_x2 = this.f_x1;
                l_x1 = this.f_x2;
                l_y1 = this.f_y1;
                l_y2 = this.f_y2;
                break;

            case PdfData.VERTICAL_BOTTOM_TO_TOP:
                //x and y axes exchanged
                l_x1 = this.f_y1;
                l_x2 = this.f_y2;
                l_y1 = this.f_x2;
                l_y2 = this.f_x1;
                break;

            case PdfData.VERTICAL_TOP_TO_BOTTOM:
                //x and y axes exchanged with the text direction reversed
                l_x1 = this.f_y2;
                l_x2 = this.f_y1;
                l_y2 = this.f_x1;
                l_y1 = this.f_x2;
                //NOTE(review): the caller's list is discarded here but itemCount is still
                //used as the loop limit below - confirm both lists have the same length
                items = this.getsortedUnusedFragments(false, true);
                items = reverse(items);
                break;

            default:
                throw new PdfException("Illegal value " + mode + "for currentWritingMode");
        }

        //holds line we're working on
        Vector_Int current_line;

        for (int j = 0; j < itemCount; j++) { //for all items

            final int c = items[j]; //fragment which starts the line
            int id = -1; //best candidate found on current scan (-1 = none)
            int i;
            int last = c; //last fragment placed on the line as its own cell
            float smallest_gap = -1, gap, yMidPt; //a negative smallest_gap means no candidate yet

            if (!isUsed[c] && this.writingMode[c] == mode) {

                //reset pointer and add this element
                current_line = new Vector_Int(20);
                current_line.addElement(c);
                lineY2.addElement((int) l_y2[c]);

                //look for items along same line (already sorted into order left to right)
                while (true) {   //look for a match
                    for (int ii = 0; ii < itemCount; ii++) {

                        i = items[ii];

                        //NOTE(review): writingMode[c] == mode was already established above, so it is
                        //always true here - possibly writingMode[i] was intended; confirm before changing
                        if (!isUsed[i] && i != c && writingMode[c] == mode && ((l_x1[i] > l_x1[c] && mode != PdfData.VERTICAL_TOP_TO_BOTTOM) || (l_x1[i] < l_x1[c] && mode == PdfData.VERTICAL_TOP_TO_BOTTOM))) { //see if on right

                            //distance from end of line start to start of candidate
                            gap = (l_x1[i] - l_x2[c]);

                            //direction is reversed in these modes so flip the sign
                            if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
                                gap = -gap;
                            }

                            //allow for fp error
                            if (gap < 0 && gap > -2) {
                                gap = 0;
                            }

                            //vertical middle of the candidate
                            yMidPt = (l_y1[i] + l_y2[i]) / 2;

                            //see if line & if only or better fit
                            //(middle of candidate lies between y2 and y1 of the line start and gap is smallest so far)
                            if (yMidPt < l_y1[c] && yMidPt > l_y2[c] && (smallest_gap < 0 || gap < smallest_gap)) {
                                smallest_gap = gap;
                                id = i;
                            }
                        }
                    }

                    if (id == -1) { //exit when no more matches
                        break;
                    }

                    //merge in best match if fit found with last or if overlaps by less than half a space,otherwise join
                    //t is the gap to the previous cell; possSpace is measured from the line start c
                    float t = l_x1[id] - l_x2[last], possSpace = l_x1[id] - l_x2[c];
                    //1.5 x the average character width of each fragment
                    float av_char1 = (float) 1.5 * ((l_x2[id] - l_x1[id]) / textLength[id]);
                    float av_char2 = (float) 1.5 * ((l_x2[last] - l_x1[last]) / textLength[last]);

                    //direction is reversed in these modes so flip the signs
                    if ((mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM)) {
                        possSpace = -possSpace;
                        t = -t;
                        av_char1 = -av_char1;
                        av_char2 = -av_char2;
                    }

                    if (t < av_char1 && t < av_char2) {
                        //closer than both thresholds so treat as part of the previous cell
                        merge(last, id, isGapASpace(id, last, possSpace, addSpaceXMLTag, mode), true);
                    } else {
                        //far enough away to be a cell of its own
                        current_line.addElement(id);
                        last = id;
                    }

                    //flag used and reset variables used
                    isUsed[id] = true;
                    id = -1;
                    smallest_gap = 1000000;

                }

                //add line to list
                lines.addElement(current_line);
                max_rows++;
            }
        }
    }

    /**
     * Extracts the text found inside a rectangle and arranges it as a table, calling the
     * low level grouping and merging routines in turn.
     * <p>
     * isCSV selects CSV or XHTML output. XHTML output also has options to include font tags
     * (keepFontInfo), preserve widths (keepWidthInfo), try to preserve alignment
     * (keepAlignmentInfo) and set a table border width (borderWidth).
     * <p>
     * Coordinates supplied in the wrong order are swapped rather than rejected.
     *
     * @param x1                is the x coord of the top left corner
     * @param y1                is the y coord of the top left corner
     * @param x2                is the x coord of the bottom right corner
     * @param y2                is the y coord of the bottom right corner
     * @param pageNumber        is the page you wish to extract from (not used by this method)
     * @param isCSV             is a boolean. If false the output is xhtml, if true the text is output as CSV
     * @param keepFontInfo      if true and isCSV is false keeps font information in extracted text.
     * @param keepWidthInfo     if true and isCSV is false keeps width information in extracted text.
     * @param keepAlignmentInfo if true and isCSV is false keeps alignment information in extracted text.
     * @param borderWidth       is the width of the border for xhtml
     * @return Map holding the table text under "content" and the (ordered) coordinates under
     * "x1", "x2", "y1" and "y2" as Strings; an empty Map if no text fragments or no content were found
     * @throws PdfException if thrown by the grouping routines
     */
    @SuppressWarnings("UnusedParameters")
    protected final Map extractTextAsTable(
            int x1,
            int y1,
            int x2,
            int y2,
            final int pageNumber,
            final boolean isCSV,
            final boolean keepFontInfo,
            final boolean keepWidthInfo,
            final boolean keepAlignmentInfo,
            final int borderWidth)
            throws PdfException {

        //put the corners into the expected order (they are swapped, not rejected)
        final int[] v = validateCoordinates(x1, y1, x2, y2);
        x1 = v[0];
        y1 = v[1];
        x2 = v[2];
        y2 = v[3];

        //Return the content as an Element
        final Map table_content = new HashMap();

        LogWriter.writeLog("extracting Text As Table");

        //flag type of table so we can add correct separators
        isXHTML = !isCSV;

        //init table variables
        lines = new Vector_Object(20);
        lineY2 = new Vector_Int(20);
        max_rows = 0;

        //init store for data (Note order of parameters passed)
        copyToArrays(x1, y2, x2, y1, keepFontInfo, false, true, null, false);

        //initial grouping and delete any hidden text
        removeEncoding();

        //eliminate shadows and also merge overlapping text
        cleanupShadowsAndDrownedObjects(false);

        final int[] items = this.getsortedUnusedFragments(true, false);
        final int item_count = items.length; //number of items

        if (item_count == 0) {
            return table_content;
        }

        //check orientation and get preferred. Items not correct will be ignored
        final int currentWritingMode = getWritingMode(items, item_count);

        final String message = "Table Merging algorithm being applied " + (item_count) + " items";
        LogWriter.writeLog(message);

        //scan all items joining best fit to right of each fragment to build lines
        //NOTE(review): with a single item nothing in this method assigns master, so the
        //lookup below relies on whatever value the field already holds - confirm
        if (item_count > 1) {

            //workout the raw lines
            createLinesInTable(item_count, items, isXHTML, currentWritingMode);

            //generate lookup with lines in correct order (minus used to get correct order down the page)
            int dx = 1;
            if (currentWritingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || currentWritingMode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
                dx = -1;
            }

            line_order = new int[max_rows];
            final int[] line_y = new int[max_rows];

            for (int i = 0; i < max_rows; i++) {
                line_y[i] = dx * lineY2.elementAt(i);
                line_order[i] = i;
            }

            line_order = Sorts.quicksort(line_y, line_order);

            //assemble the rows and columns
            createTableRows(keepAlignmentInfo, keepWidthInfo, currentWritingMode);

            //roll the rows together into the final table
            mergeTableRows(borderWidth);

        }

        //cleanup hands a null buffer straight back, so the null test has to be made on
        //the buffer itself - testing the result of toString() can never catch it
        final StringBuilder cleaned = cleanup(content[master]);
        content[master] = cleaned;

        if (cleaned == null) {
            return table_content;
        }

        String processed_value = cleaned.toString();

        //cleanup data if needed by removing duplicate font tokens
        if (!isCSV) {
            processed_value = Fonts.cleanupTokens(processed_value);
        }

        table_content.put("content", processed_value);
        table_content.put("x1", String.valueOf(x1));
        table_content.put("x2", String.valueOf(x2));
        table_content.put("y1", String.valueOf(y1));
        table_content.put("y2", String.valueOf(y2));

        return table_content;
    }

    /**
     * Puts the corners of a rectangle into the expected order, so that {@code x1 <= x2}
     * and {@code y1 >= y2}. A pair supplied the wrong way round is swapped and the swap is
     * logged; no exception is thrown.
     *
     * @return new array holding the ordered values as {x1, y1, x2, y2}
     */
    private static int[] validateCoordinates(int x1, int y1, int x2, int y2) {
        int left = x1, top = y1, right = x2, bottom = y2;

        //x values back to front
        if (left > right) {
            left = x2;
            right = x1;
            LogWriter.writeLog("x1 > x2, coordinates were swapped to validate");
        }

        //y values back to front
        if (top < bottom) {
            top = y2;
            bottom = y1;
            LogWriter.writeLog("y1 < y2, coordinates were swapped to validate");
        }

        return new int[]{left, top, right, bottom};
    }

    /**
     * Algorithm to place data from within coordinates into a list of words, each followed
     * by its four coordinates.
     * <p>
     * Coordinates supplied in the wrong order are swapped rather than rejected.
     *
     * @param x1             is the x coord of the top left corner
     * @param y1             is the y coord of the top left corner
     * @param x2             is the x coord of the bottom right corner
     * @param y2             is the y coord of the bottom right corner
     * @param page_number    is the page you wish to extract from (not used by this method)
     * @param breakFragments will divide up text based on white space characters
     * @param punctuation    is a string containing all values that should be used to divide up words
     * @return List holding, for every word, the word followed by four coordinate values as Strings
     * (x1,y1,x2,y2, with the y values exchanged for rotated bottom to top text), or null if no text was found
     * @throws PdfException if thrown by the grouping routines
     */
    @SuppressWarnings("UnusedParameters")
    protected final List extractTextAsWordlist(
            int x1,
            int y1,
            int x2,
            int y2,
            final int page_number,
            final boolean breakFragments,
            final String punctuation)
            throws PdfException {

        //put the corners into the expected order
        final int[] ordered = validateCoordinates(x1, y1, x2, y2);
        x1 = ordered[0];
        y1 = ordered[1];
        x2 = ordered[2];
        y2 = ordered[3];

        final boolean useUnrotatedCoords = PdfGroupingAlgorithms.useUnrotatedCoords;

        //extract the raw fragments (Note order of parameters passed)
        if (!breakFragments) {
            copyToArrays();
        } else {
            copyToArrays(x1, y2, x2, y1, true, true, false, punctuation, true);
        }

        //delete any hidden text, then eliminate shadows and merge overlapping text
        removeEncoding();
        cleanupShadowsAndDrownedObjects(true);

        final int[] fragments = getsortedUnusedFragments(true, false);
        final int fragmentCount = fragments.length;

        //if no values return null
        if (fragmentCount == 0) {
            LogWriter.writeLog("Less than 1 text item on page");
            return null;
        }

        //check orientation and get preferred. Items not correct will be ignored
        final int currentWritingMode = getWritingMode(fragments, fragmentCount);

        //build set of lines from text
        createLines(fragmentCount, fragments, currentWritingMode, true, false, false, false);

        //choose which arrays supply each coordinate (rotated unless unrotated values requested)
        float[] l_x1 = null, l_x2 = null, l_y1 = null, l_y2 = null;

        if (useUnrotatedCoords) {
            l_x1 = this.f_x1;
            l_x2 = this.f_x2;
            l_y1 = this.f_y1;
            l_y2 = this.f_y2;
        } else {
            switch (currentWritingMode) {
                case PdfData.HORIZONTAL_LEFT_TO_RIGHT:
                    l_x1 = this.f_x1;
                    l_x2 = this.f_x2;
                    l_y1 = this.f_y1;
                    l_y2 = this.f_y2;
                    break;
                case PdfData.HORIZONTAL_RIGHT_TO_LEFT:
                    l_x1 = this.f_x2;
                    l_x2 = this.f_x1;
                    l_y1 = this.f_y1;
                    l_y2 = this.f_y2;
                    break;
                case PdfData.VERTICAL_BOTTOM_TO_TOP:
                    l_x1 = this.f_y2;
                    l_x2 = this.f_y1;
                    l_y1 = this.f_x2;
                    l_y2 = this.f_x1;
                    break;
                case PdfData.VERTICAL_TOP_TO_BOTTOM:
                    l_x1 = this.f_y1;
                    l_x2 = this.f_y2;
                    l_y1 = this.f_x2;
                    l_y2 = this.f_x1;
                    break;
                default:
                    //arrays stay unset for any other value
                    break;
            }
        }

        //rotated bottom to top text is the only case which lists y2 ahead of y1
        final boolean swapY = !useUnrotatedCoords && currentWritingMode == PdfData.VERTICAL_BOTTOM_TO_TOP;

        //word followed by its four coordinates
        final List values = new ArrayList();

        for (int i = 0; i < content.length; i++) {

            final StringBuilder word = content[i];
            if (word == null) {
                continue;
            }

            //make sure colour tokens open and close the word
            if (colorExtracted && isXMLExtraction) {
                if (!word.toString().toLowerCase().startsWith(GenericColorSpace.cb)) {
                    word.insert(0, f_colorTag[master]);
                }
                if (!word.toString().toLowerCase().endsWith(GenericColorSpace.ce)) {
                    word.append(GenericColorSpace.ce);
                }
            }

            //the word itself, converted to text unless XML was asked for
            if (!isXMLExtraction) {
                values.add(Strip.convertToText(word.toString(), isXMLExtraction));
            } else {
                values.add(word.toString());
            }

            //and where it is
            values.add(String.valueOf(l_x1[i]));
            values.add(String.valueOf(swapY ? l_y2[i] : l_y1[i]));
            values.add(String.valueOf(l_x2[i]));
            values.add(String.valueOf(swapY ? l_y1[i] : l_y2[i]));
        }

        LogWriter.writeLog("Text extraction as wordlist completed");

        return values;

    }

    /**
     * Puts the shared working values back to their starting state
     */
    private void reset() {

        //output flags
        isXHTML = true;
        colorExtracted = false;

        //counters and pointers
        nextSlot = 0;
        max_rows = 0;
        master = 0;

        //fresh store for line breaks
        lineBreaks = new Vector_Int();

    }

    /**
     * Algorithm to place data from specified coordinates on a page into a String.
     *
     * @param x1                 is the x coord of the top left corner
     * @param y1                 is the y coord of the top left corner
     * @param x2                 is the x coord of the bottom right corner
     * @param y2                 is the y coord of the bottom right corner
     * @param page_number        is the page you wish to extract from
     * @param estimateParagraphs will attempt to find paragraphs and add new lines in output if true
     * @param breakFragments     will divide up text based on white space characters if true
     * @return Vector containing words found and words coordinates (word, x1,y1,x2,y2...)
     * @throws PdfException If the co-ordinates are not valid
     */
    @SuppressWarnings("UnusedParameters")
    protected final String extractTextInRectangle(
            int x1,
            int y1,
            int x2,
            int y2,
            final int page_number,
            final boolean estimateParagraphs,
            final boolean breakFragments)
            throws PdfException {


        reset();

        if ((breakFragments) && (!pdf_data.IsEmbedded())) {
            throw new PdfException("[PDF] Request to breakfragments and width not added. Please add call to init(true) of PdfDecoder to your code.");
        }

        //make sure co-ords valid and throw exception if not
        final int[] v = validateCoordinates(x1, y1, x2, y2);
        x1 = v[0];
        y1 = v[1];
        x2 = v[2];
        y2 = v[3];

        //extract the raw fragments (Note order or parameters passed)
        if (breakFragments) {
            copyToArrays(x1, y2, x2, y1, (isXMLExtraction), false, false, null, false);
        } else {
            copyToArrays();
        }

        //delete any hidden text
        removeEncoding();

        //eliminate shadows and also merge overlapping text
        cleanupShadowsAndDrownedObjects(false);

        //get the fragments as an array
        final int[] items = getsortedUnusedFragments(true, false);
        final int count = items.length;

        //if no values return null
        if (count == 0) {
            LogWriter.writeLog("Less than 1 text item on page");

            return null;
        }

        //check orientation and get preferred. Items not correct will be ignored
        final int currentWritingMode = getWritingMode(items, count);

        //build set of lines from text
        createLines(count, items, currentWritingMode, false, isXMLExtraction, false, false);

        //roll lines together
        final int masterIndex = mergeLinesTogether(currentWritingMode, estimateParagraphs, x1, x2, y1, y2);

        //add final deliminators
        if (isXMLExtraction) {
            content[masterIndex] = new StringBuilder(Fonts.cleanupTokens(content[masterIndex].toString()));
            content[masterIndex].insert(0, "
");
            content[masterIndex].append("");
        }

        LogWriter.writeLog("Text extraction completed");

        return cleanup(content[masterIndex]).toString();

    }


    /**
     * Tidies text which is about to be returned as XML. Bare ampersands are escaped and
     * numeric references to control characters which are not allowed in XML are removed.
     * Text is returned untouched when XML extraction is off.
     * <p>
     * Known limits: the literal sequences XX#, XXlt and XXgt are used as temporary
     * placeholders, so any which occur in the text come back as &amp;#, &amp;lt and &amp;gt,
     * and an entity other than lt, gt or a numeric reference has its ampersand escaped again.
     *
     * @param buffer text to tidy (can be null)
     * @return the buffer passed in if it is null or XML extraction is off, otherwise a new
     * StringBuilder holding the tidied text
     */
    private StringBuilder cleanup(StringBuilder buffer) {

        //nothing to do without content or for plain text output
        if (buffer == null || !isXMLExtraction) {
            return buffer;
        }

        String buf = buffer.toString();

        //hide the references which are already escaped so the ampersand pass leaves them alone
        //NOTE(review): the placeholder names show lt/gt entities were protected here; the
        //entity text had been lost from these literals and is restored - confirm
        buf = buf.replace("&#", "XX#");
        buf = buf.replace("&lt", "XXlt");
        buf = buf.replace("&gt", "XXgt");

        //sort out & to &amp;
        buf = buf.replace("&", "&amp;");

        //put back others
        buf = buf.replace("XX#", "&#");
        buf = buf.replace("XXlt", "&lt");
        buf = buf.replace("XXgt", "&gt");

        /*
         * Restricted Char ::=
         *  hex     [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F]
         *  decimal [1-8]     | [11-12]   | [14-31]    | [127-132]   | [134-159]
         *
         * Only the first three ranges are removed; references in the
         * 127-132 and 134-159 ranges are left in place.
         */
        final int[][] removedRanges = {{1, 8}, {11, 12}, {14, 31}};

        //strip the decimal reference for every value in the ranges
        for (final int[] range : removedRanges) {
            for (int code = range[0]; code <= range[1]; code++) {
                buf = buf.replace("&#" + code + ';', "");
            }
        }

        return new StringBuilder(buf);
    }

    /**
     * Scan the fragments and pick the writing mode used by the most unused fragments.
     * When several modes share the highest count the lowest numbered mode wins.
     *
     * @param items fragment ids to examine
     * @param count number of entries in items to examine
     * @return the chosen writing mode (0 to 3)
     */
    private int getWritingMode(final int[] items, final int count) {

        //how many unused fragments use each mode
        final int[] tally = new int[4];
        for (int idx = 0; idx < count; idx++) {
            final int fragment = items[idx];

            if (isUsed[fragment]) {
                continue;
            }
            tally[writingMode[fragment]]++;
        }

        //strictly greater, so an equal count never displaces an earlier mode
        int preferred = 0;
        int candidate = 1;
        while (candidate != tally.length) {
            if (tally[candidate] > tally[preferred]) {
                preferred = candidate;
            }
            candidate++;
        }

        return preferred;
    }

    /**
     * Merges every fragment returned by getsortedUnusedFragments into a single
     * "master" fragment (the last entry of the sorted index list), choosing a
     * separator between consecutive fragments from the vertical gap, the line
     * height and - when estimateParagraphs is set - simple punctuation heuristics.
     *
     * @param currentWritingMode one of the PdfData writing mode constants handled by
     *                           the switch below; selects how the coordinate arrays are mapped
     * @param estimateParagraphs if true, a paragraph break is inserted when the master text
     *                           ends in '.' or '"' and the child starts with 'A'-'Z'
     * @param x1 used with x2 to compute the page middle for the horizontal modes
     * @param x2 used with x1 to compute the page middle for the horizontal modes
     * @param y1 used with y2 to compute the page middle for the vertical modes
     * @param y2 used with y1 to compute the page middle for the vertical modes
     * @return index of the master fragment everything was merged into
     * @throws PdfException if currentWritingMode is not one of the four handled values
     */
    private int mergeLinesTogether(final int currentWritingMode, final boolean estimateParagraphs, final int x1, final int x2, final int y1, final int y2) throws PdfException {

        StringBuilder separator;

        int[] indices;

        //used for working out alignment
        final int middlePage;

        //local aliases for the co-ordinate arrays, remapped per writing mode
        //(references only - no data is copied)
        final float[] l_x1;
        final float[] l_x2;
        final float[] l_y1;
        final float[] l_y2;

        switch (currentWritingMode) {
            case PdfData.HORIZONTAL_LEFT_TO_RIGHT:
                l_x1 = this.f_x1;
                l_x2 = this.f_x2;
                l_y1 = this.f_y1;
                l_y2 = this.f_y2;
                indices = getsortedUnusedFragments(false, true);
                middlePage = (x1 + x2) / 2;
                break;
            case PdfData.HORIZONTAL_RIGHT_TO_LEFT:
                //x1 and x2 are swapped so the rest of the method can treat text as left to right
                l_x2 = this.f_x1;
                l_x1 = this.f_x2;
                l_y1 = this.f_y1;
                l_y2 = this.f_y2;
                indices = getsortedUnusedFragments(false, true);
                middlePage = (x1 + x2) / 2;
                break;
            case PdfData.VERTICAL_BOTTOM_TO_TOP:
                //x and y axes are exchanged; the sorted order is also reversed for this mode
                l_x1 = this.f_y1;
                l_x2 = this.f_y2;
                l_y1 = this.f_x2;
                l_y2 = this.f_x1;
                indices = getsortedUnusedFragments(true, true);
                indices = reverse(indices);
                middlePage = (y1 + y2) / 2;
                break;
            case PdfData.VERTICAL_TOP_TO_BOTTOM:
                //x and y axes are exchanged
                l_x1 = this.f_y2;
                l_x2 = this.f_y1;
                l_y2 = this.f_x2;
                l_y1 = this.f_x1;
                indices = getsortedUnusedFragments(true, true);
                middlePage = (y1 + y2) / 2;
                break;
            default:
                throw new PdfException("Illegal value " + currentWritingMode + "for currentWritingMode");
        }
        final int quarter = middlePage / 2;
        final int count = indices.length;
        //NOTE(review): if indices is empty this reads indices[-1] and throws
        //ArrayIndexOutOfBoundsException - confirm callers guarantee at least one fragment
        final int masterIndex = indices[count - 1];

        //now loop through all lines merging
        //(C = child, M = master; values are chars held as int, or -1 if none found)
        int ClastChar, MlastChar, CFirstChar;

        //walk from the second-to-last entry back to the first, folding each child into the master
        for (int i = count - 2; i > -1; i--) {

            final int child = indices[i];
            separator = new StringBuilder();

            //-1 if no chars
            ClastChar = getLastChar(content[child]);

            //children for which no last character is found are skipped entirely (never merged)
            if (ClastChar != -1) {

                addAlignmentFormatting(estimateParagraphs, middlePage, l_x1, l_x2, quarter, child);

                //see if we insert a line break and merge
                //NOTE(review): the literal below spans a line break, which is not legal Java;
                //markup text appears to be missing from it - confirm against the upstream source
                String lineSpace = "
" + SYSTEM_LINE_SEPARATOR + "";
                if (isXMLExtraction) {
                    lineSpace = SYSTEM_LINE_SEPARATOR;
                }

                //distance between the master's y2 edge and the child's y1 edge
                float gap = l_y2[masterIndex] - l_y1[child];
                float line_height = l_y1[child] - l_y2[child];

                //Added for case where line can be less than 1 in height and cause the extraction
                //to hang and excessive new lines to be added
                if (line_height < 1) {
                    line_height = l_y1[masterIndex] - l_y2[masterIndex];
                }

                //signs are flipped for this mode so the comparisons below work on positive values
                if (currentWritingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
                    gap = -gap;
                    line_height = -line_height;
                }

                //NOTE(review): '&' here is the non-short-circuit boolean AND; both operands are
                //side-effect free so the result matches '&&'
                if ((gap > line_height) & (line_height > 0)) { //add in line gaps

                    //one lineSpace per whole line_height that fits in the gap
                    while (gap > line_height) {
                        separator.append(lineSpace);
                        gap -= line_height;
                    }

                    //in the non-XML case the separators accumulated above are discarded
                    //and replaced by a single line separator
                    if (isXMLExtraction) {
                        separator.append("
").append(SYSTEM_LINE_SEPARATOR).append("");
                    } else {
                        separator = new StringBuilder(SYSTEM_LINE_SEPARATOR);
                    }

                } else if (estimateParagraphs) {

                    CFirstChar = getFirstChar(content[child]);
                    MlastChar = getLastChar(content[masterIndex]);

                    //master ends a sentence/quote and child starts with a capital: treat as new paragraph
                    if ((((MlastChar == '.')) || (((MlastChar == '\"')))) && ((CFirstChar >= 'A') && (CFirstChar <= 'Z'))) {
                        if (isXMLExtraction) {
                            separator.append("
").append(SYSTEM_LINE_SEPARATOR).append("");
                        } else {
                            separator = new StringBuilder(SYSTEM_LINE_SEPARATOR);
                        }
                    } else if (fontSize[child] > 70 && fontSize[child] == fontSize[masterIndex] && line_height > 70 && gap > 5 && line_height > 0) { //add in spaces

                        //very large text (size and line height above 70) of matching size: join with a space
                        if (isXMLExtraction) {
                            content[child].insert(0, ' ');
                        } else {
                            content[masterIndex].append(' ');
                        }
                    }

                } else {
                    //default: put a line break between master and child by editing the content
                    //directly; separator stays empty
                    if (isXMLExtraction) {
                        content[child].insert(0, "
" + SYSTEM_LINE_SEPARATOR + "");
                    } else {
                        content[masterIndex].append(SYSTEM_LINE_SEPARATOR);
                    }
                }

                merge(masterIndex, child, separator.toString(), false);

            }
        }
        return masterIndex;
    }

    /**
     * return first visible char as int or -1 if no match.
     * Spaces are skipped and, when isXMLExtraction is set, text between '<' and '>'
     * and entities opened by '&' are stepped over (with the lt/gt entities
     * reported as '<' and '>').
     */
    private int getFirstChar(final StringBuilder buffer) {

        final int length = buffer.length();
        boolean insideToken = false;
        char tokenStart = ' ';

        for (int pos = 0; pos < length; pos++) {
            final char current = buffer.charAt(pos);

            if (!insideToken) {
                if (isXMLExtraction && (current == '<' || current == '&')) {
                    insideToken = true;
                    tokenStart = current;

                    //trap & .... &xx; or other spurious
                    if (current == '&') {
                        if (pos + 1 == length) {
                            return '&';
                        }

                        final char following = buffer.charAt(pos + 1);
                        if (following != '#' && following != 'g' && following != 'l') {
                            return '&';
                        }
                    }
                } else if (current != ' ') {
                    //plain text - first non-space char is the answer
                    return current;
                }
            }

            if (insideToken) {

                //allow for valid & in stream
                if (tokenStart == '&' && current == ' ') {
                    return tokenStart;
                }

                final boolean closesToken = current == '>' || (tokenStart == '&' && current == ';');
                if (isXMLExtraction && closesToken) {

                    //put back < or >
                    if (current == ';' && tokenStart == '&' && pos > 2 && buffer.charAt(pos - 1) == 't') {
                        final char entityLetter = buffer.charAt(pos - 2);
                        if (entityLetter == 'l') {
                            return '<';
                        }
                        if (entityLetter == 'g') {
                            return '>';
                        }
                    }

                    insideToken = false;
                }
            }
        }

        return -1;
    }

    /**
     * return char as int or -1 if no match.
     * <p>
     * Scans the buffer backwards from its last character, ignoring spaces
     * (char code 32). A '>' - or, when isXMLExtraction is set, a ';' - is treated
     * as the possible end of a tag or entity which is then stepped over until the
     * matching '<' or '&' is reached. The entities lt and gt are reported as
     * '<' and '>'.
     * <p>
     * NOTE(review): one statement in the body is truncated (see the note inside
     * the method), so the method does not compile as it stands.
     *
     * @param buffer text to inspect
     * @return last significant char as an int, or -1 if none is found
     */
    private int getLastChar(final StringBuilder buffer) {

        int i = -1;
        boolean inTag = false;
        int count = buffer.length();
        final int size = count;
        char openChar = ' ';
        count--; //knock 1 off so points to last char

        //setting count to -1 below is how the loop is terminated once an answer is found
        while (count > -1) {
            final char nextChar = buffer.charAt(count);

            //trap &xx;;
            if (inTag && openChar == ';' && nextChar == ';') {
                i = ';';
                count = -1;
            }

            if (!inTag && (nextChar == '>' || (isXMLExtraction && nextChar == ';'))) {
                inTag = true;

                //check it is a token and not just > at end
                //NOTE(review): the next statement is truncated - the string literal is never
                //closed, 'ptr' is not declared anywhere in this method and the braces that
                //follow do not balance. Restore the missing text from the upstream source.
                final int lastTokenStart = buffer.lastIndexOf("') {
                            inTag = false;
                            ptr = count;
                        }
                    }
                }

                if (inTag) {
                    openChar = nextChar;
                } else {
                    i = nextChar;
                    count = -1;
                }
            }

            if (!inTag && nextChar != 32) {
                i = nextChar;
                count = -1;
            }

            if (nextChar == '<' || (isXMLExtraction && openChar == ';' && nextChar == '&')) {
                inTag = false;

                //put back < or >
                if (nextChar == '&' && (count + 3 < size) && (buffer.charAt(count + 2) == 't') && (buffer.charAt(count + 3) == ';')) {
                    if ((buffer.charAt(count + 1) == 'l')) {
                        i = '<';
                        count = -1;
                    } else if ((buffer.charAt(count + 1) == 'g')) {
                        i = '>';
                        count = -1;
                    }
                }
            }

            if (inTag && openChar == ';' && nextChar == ' ') {
                count = -1;
                i = ';';
            }
            count--;
        }

        return i;
    }

    /**
     * Returns a new array holding the supplied values in the opposite order;
     * the array passed in is left untouched.
     */
    private static int[] reverse(final int[] indices) {
        final int[] flipped = new int[indices.length];

        //fill the copy from its last slot towards its first
        int target = flipped.length - 1;
        for (final int value : indices) {
            flipped[target] = value;
            target--;
        }

        return flipped;
    }

    /**
     * Wraps the content of one fragment in alignment markup when it looks centred
     * or right aligned. Only applies when estimateParagraphs is false and
     * isXMLExtraction is set; otherwise the fragment is left unchanged.
     * <p>
     * NOTE(review): the original comment mentions LEFT, CENTER and RIGHT tags but
     * only a centring branch and a right-align branch exist here, and the tag
     * literals themselves look damaged (see notes in the body).
     *
     * @param estimateParagraphs alignment is only added when this is false
     * @param middlePage         co-ordinate of the middle of the page along the reading direction
     * @param f_x1               start co-ordinates per fragment (parameter hides the field of the same name)
     * @param f_x2               end co-ordinates per fragment (parameter hides the field of the same name)
     * @param quarter            half of middlePage as computed by the caller; bounds where the fragment may start
     * @param child              index of the fragment to examine
     */
    private void addAlignmentFormatting(final boolean estimateParagraphs, final int middlePage, final float[] f_x1, final float[] f_x2, final int quarter, final int child) {
        //put in some alignment
        //distance from the fragment's start to the middle, and from the middle to its end
        final float left_gap = middlePage - f_x1[child];
        final float right_gap = f_x2[child] - middlePage;
        //fragment must straddle the middle and start between quarter and middlePage + quarter
        if ((!estimateParagraphs) && (isXMLExtraction) &&
                (left_gap > 0) && (right_gap > 0) && (f_x1[child] > quarter) && (f_x1[child] < (middlePage + quarter))) {

            //normalise to the range 0..1 so 1 means perfectly symmetric about the middle
            float ratio = left_gap / right_gap;
            if (ratio > 1) {
                ratio = 1 / ratio;
            }

            if (ratio > 0.95) {  //add centring if seems centered around middle
                content[child] = new StringBuilder(Fonts.cleanupTokens(content[child].toString()));
                //NOTE(review): this literal spans a line break, which is not legal Java; the
                //opening/closing tag text appears to be missing - confirm against upstream source
                content[child].insert(0, "
");
                content[child].append("\n");
            } else if ((right_gap < 10) & (left_gap > 30)) {  //add right align
                content[child] = new StringBuilder(Fonts.cleanupTokens(content[child].toString()));
                //NOTE(review): inserting an empty string is a no-op; tag text presumably missing here too
                content[child].insert(0, "");
                content[child].append("\n");

            }
        }
    }

    /**
     * convert fragments into lines of text.
     * <p>
     * For each unused fragment whose writing mode equals mode, repeatedly looks for
     * the closest unused fragment further along the reading direction whose
     * vertical mid-point lies between the first fragment's top and bottom, and
     * merges it in with the separator chosen by isGapASpace. Stops extending a line
     * when no candidate is found or when breakOnSpace applies.
     *
     * @param count                  number of entries of items to scan
     * @param items                  fragment indices to work through
     * @param mode                   writing mode being processed (one of the PdfData constants handled below)
     * @param breakOnSpace           if true (and hadSpace is available) a line stops growing when the current
     *                               fragment had a trailing space or the separator starts with a space
     * @param addMultiplespaceXMLTag passed through to isGapASpace
     * @param sameLineOnly           enables the extra rejection tests (gap, baseline offset, font size change)
     * @param isSearch               selects the search variant of the gap and candidate tests
     * @throws PdfException if mode is not one of the four handled values
     */
    private void createLines(final int count, int[] items, final int mode, final boolean breakOnSpace, final boolean addMultiplespaceXMLTag, final boolean sameLineOnly, final boolean isSearch) throws PdfException {

        String separator;

        //compile-time switch for the System.out tracing below
        final boolean debug = false;

        //local aliases for the co-ordinate arrays (references only - no data is copied)
        final float[] l_x1;
        final float[] l_x2;
        final float[] l_y1;
        final float[] l_y2;

        //reverse order if text right to left
        //NOTE(review): reverse() flips the whole array, so if count < items.length the first
        //count entries afterwards are the tail of the original - confirm callers pass count == items.length
        if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
            items = reverse(items);
        }

        //set pointers so left to right text
        switch (mode) {
            case PdfData.HORIZONTAL_LEFT_TO_RIGHT:
                l_x1 = this.f_x1;
                l_x2 = this.f_x2;
                l_y1 = this.f_y1;
                l_y2 = this.f_y2;
                break;
            case PdfData.HORIZONTAL_RIGHT_TO_LEFT:
                l_x2 = this.f_x1;
                l_x1 = this.f_x2;
                l_y1 = this.f_y1;
                l_y2 = this.f_y2;
                break;
            case PdfData.VERTICAL_BOTTOM_TO_TOP:
                l_x1 = this.f_y1;
                l_x2 = this.f_y2;
                l_y1 = this.f_x2;
                l_y2 = this.f_x1;
                break;
            case PdfData.VERTICAL_TOP_TO_BOTTOM:
                l_x1 = this.f_y2;
                l_x2 = this.f_y1;
                l_y2 = this.f_x1;
                l_y1 = this.f_x2;
                break;
            default:
                throw new PdfException("Illegal value " + mode + "for currentWritingMode");
        }

        //Scan items joining best fit to right of each fragment to build lines.
        for (int j = 0; j < count; j++) {

            //id = best candidate found so far (-1 if none), i = candidate being tested
            int id = -1, i;
            final int c = items[j];

            //smallest_gap below zero means no candidate has been accepted yet
            float smallest_gap = -1, gap, yMidPt;
            if (!isUsed[c] && this.writingMode[c] == mode) {

//				if(debug) {
//                    System.out.println("Look for match with "+removeHiddenMarkers(content[c].toString()));
//                }

                //each pass finds and merges the single best neighbour of c; exits via break
                while (true) {
                    for (int j2 = 0; j2 < count; j2++) {
                        i = items[j2];

                        if (!isUsed[i]) {

                            //amount of variation in bottom of text
                            int baseLineDifference = (int) (l_y2[i] - l_y2[c]);
                            if (baseLineDifference < 0) {
                                baseLineDifference = -baseLineDifference;
                            }

                            //amount of variation in top of text
                            int topLineDifference = (int) (l_y1[i] - l_y1[c]);
                            if (topLineDifference < 0) {
                                topLineDifference = -topLineDifference;
                            }

                            // line gap
                            int lineGap = (int) (l_x1[i] - l_x2[c]);

                            //Check if fragments are closer from the other end
                            if (!isSearch && lineGap > (int) (l_x1[c] - l_x2[i])) {
                                lineGap = (int) (l_x1[c] - l_x2[i]);
                            }

                            //absolute difference in font size between the two fragments
                            int fontSizeChange = fontSize[c] - fontSize[i];
                            if (fontSizeChange < 0) {
                                fontSizeChange = -fontSizeChange;
                            }

//							if(debug) {
//                                System.out.println("Against "+removeHiddenMarkers(content[i].toString()));
//                            }

                            //the first four branches only reject the candidate (sameLineOnly mode);
                            //the debug labels skip "case2"
                            if (sameLineOnly && lineGap > fontSize[c] && lineGap > 0) { //ignore text in wrong order allowing slight margin for error
                                // allow for multicolumns with gap

                                if (debug) {
                                    System.out.println("case1 lineGap=" + lineGap);
                                }
                            } else if (sameLineOnly && baseLineDifference > 1 && lineGap > 2 * fontSize[c] && (fontSize[c] == fontSize[i])) { //TEXT SLIGHTLY OFFSET
                                //ignore
                                if (debug) {
                                    System.out.println("case3");
                                }
                            } else if (sameLineOnly && baseLineDifference > 3) {
                                //ignore
                                if (debug) {
                                    System.out.println("case4");
                                }
                            } else if (sameLineOnly && fontSizeChange > 2) {
                                //ignore
                                if (debug) {
                                    System.out.println("case5");
                                }
                            } else if ((isSearch && (i != c && !(lineGap > 2 * fontSize[c] || -lineGap > 2 * fontSize[c]) &&
                                    ((l_x1[i] > l_x1[c] && mode != PdfData.VERTICAL_TOP_TO_BOTTOM) ||
                                            (l_x1[i] < l_x1[c] && mode == PdfData.VERTICAL_TOP_TO_BOTTOM) &&
                                                    writingMode[c] == mode &&
                                                    (!(fontSizeChange > 2) || (fontSizeChange > 2 && topLineDifference < 3)))))
                                    ||
                                    (!isSearch && (i != c && ((l_x1[i] > l_x1[c] && mode != PdfData.VERTICAL_TOP_TO_BOTTOM) ||
                                            l_x1[i] < l_x1[c] && mode == PdfData.VERTICAL_TOP_TO_BOTTOM && writingMode[c] == mode
                                                    && (!(fontSizeChange > 2) || (fontSizeChange > 2 && topLineDifference < 3))
                                    )))
                                    ) { //see if on right

                                //reached when i is a different fragment that starts beyond c along the
                                //reading direction (test reversed for VERTICAL_TOP_TO_BOTTOM); the search
                                //variant additionally requires |lineGap| <= 2 * fontSize[c].
                                //NOTE(review): && binds tighter than ||, so the writingMode/font-size
                                //checks apply only to the VERTICAL_TOP_TO_BOTTOM alternative - confirm intended

                                gap = (l_x1[i] - l_x2[c]);

                                if (debug) {
                                    System.out.println("case6 gap=" + gap);
                                }

                                if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
                                    gap = -gap;
                                }

                                //allow for fp error
                                if ((gap < 0) && (gap > -2)) {
                                    gap = 0;
                                }

                                //make sure on right
                                yMidPt = (l_y1[i] + l_y2[i]) / 2;

                                //see if line & if only or better fit
                                //(candidate's mid-point must sit strictly between c's y2 and y1)
                                if ((yMidPt < l_y1[c]) &&
                                        (yMidPt > l_y2[c]) &&
                                        ((smallest_gap < 0) || (gap < smallest_gap))) {
                                    smallest_gap = gap;
                                    id = i;
                                }
                            }
                        }
                    }

                    //merge on next right item or exit when no more matches
                    if (id == -1) {
                        break;
                    }

                    //distance between c and the chosen fragment, computed per writing mode
                    float possSpace = l_x1[id] - l_x2[c];
                    if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
                        possSpace = -possSpace;
                    } else if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
                        possSpace = (l_x2[id] - l_x1[c]);
                    }

                    //add space if gap between this and last object
                    separator = isGapASpace(c, id, possSpace, addMultiplespaceXMLTag, mode);

                    //merge if adjoin
                    //(when breaking on spaces, stop growing this line instead of merging)
                    if ((breakOnSpace) &&
                            (((hadSpace != null) && ((hadSpace[c]) || (separator.startsWith(" ")))))) {
                        break;
                    }

                    merge(c, id, separator, true);


                    id = -1; //reset
                    //large sentinel (not -1) so the next accepted candidate must beat it via gap < smallest_gap
                    smallest_gap = 1000000; //and reset the gap

                }
            }
        }
    }

    protected static class ResultsComparatorRectangle implements Comparator