org.jpedal.grouping.PdfGroupingAlgorithms Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of jsign-jpedal Show documentation
JPedal fork
The newest version!
/*
 * ===========================================
 * Java Pdf Extraction Decoding Access Library
 * ===========================================
 *
 * Project Info:  http://www.idrsolutions.com
 * Help section for developers at http://www.idrsolutions.com/java-pdf-library-support/
 *
 * (C) Copyright 1997-2013, IDRsolutions and Contributors.
 *
 * 	This file is part of JPedal
 *
     This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


 *
 * ---------------
 * PdfGroupingAlgorithms.java
 * ---------------
 */
package org.jpedal.grouping;

import java.awt.Point;
import java.awt.Rectangle;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jpedal.color.GenericColorSpace;
import org.jpedal.exception.PdfException;
import org.jpedal.objects.PdfData;
import org.jpedal.objects.PdfPageData;
import org.jpedal.utils.Fonts;
import org.jpedal.utils.LogWriter;
import org.jpedal.utils.Sorts;
import org.jpedal.utils.Strip;
import org.jpedal.utils.repositories.Vector_Float;
import org.jpedal.utils.repositories.Vector_Int;
import org.jpedal.utils.repositories.Vector_Object;
import org.jpedal.utils.repositories.Vector_Rectangle;
import org.jpedal.utils.repositories.Vector_String;

/**
 * Applies heuristics to unstructured PDF text to create content
 */
public class PdfGroupingAlgorithms {

	/** whether HTML tags are included in teasers; changed through setIncludeHTML(boolean) */
	private boolean includeHTMLtags = false;

	/** mode constant - NOTE(review): presumably means only a caller-supplied punctuation list is used; no use is visible in this part of the file, confirm */
	public static final int USER_DEFINED_LIST_ONLY = 0;
	/** mode constant - NOTE(review): presumably means any surrounding punctuation is accepted; no use is visible in this part of the file, confirm */
	public static final int SURROUND_BY_ANY_PUNCTUATION = 1;

	/** line separator, initialised from the "line.separator" system property and replaceable through setSeparator(String) */
	private static String SystemSeparator = System.getProperty("line.separator");

	// public PdfGroupingAlgorithms() {}

	/** ==============START OF ARRAYS================ */
	/**
	 * content is stored in a set of arrays. We have tried various methods (ie create composite object, etc) and none are entirely satisfactory. The
	 * beauty of this method is speed.
	 */

	/**
	 * flag to show this item has been merged into another and should be ignored. This allows us to repeat operations on live elements without lots of
	 * deleting.
	 */
	private boolean[] isUsed;

	/** co-ords of object (x1,y1 is top left) */
	private float[] f_x1, f_x2, f_y1, f_y2;

	/** track if we removed space from end */
	private boolean[] hadSpace;

	/** hold colour info */
	private String[] f_colorTag;

	/** hold writing mode */
	private int[] writingMode;

	/** hold move type */
	private int[] moveType;

	/** font sizes in pixels */
	private int[] fontSize;

	/** amount of space a space uses in this font/size */
	private float[] spaceWidth;

	/** actual text */
	private StringBuilder[] content;

	/** raw number of text characters */
	private int[] textLength;

	/** ==============END OF ARRAYS================ */

	/**
	 * handle on page data object. We extract data from this into local arrays and return grouped content into object at end. This is done for speed.
	 */
	private PdfData pdf_data;

	PdfPageData pageData;

	/** flag to show if output for table is CSV or XHTML */
	private boolean isXHTML = true;

	/** slot to insert next value - used when we split fragments for table code */
	private int nextSlot;

	/** vertical breaks for table calculation */
	private Vector_Int lineBreaks = new Vector_Int();

	/** holds details as we scan lines for table */
	private Vector_Object lines;

	/** lookup table used to sort into correct order for table */
	private Vector_Int lineY2;

	/**
	 * marker char used in content (we bury location for each char so we can split)
	 */
	private static final String MARKER = PdfData.marker;
	public static char MARKER2 = MARKER.charAt(0);

	/** counters for cols and rows and pointer to final object we merge into */
	private int max_rows = 0, master = 0;

	/** flag to show color info is being extracted */
	private boolean colorExtracted = false;

	/** used to calculate correct order for table lines */
	private int[] line_order;

	/** amount we resize arrays holding content with if no space */
	private final static int increment = 100;

	public static boolean useUnrotatedCoords;

	/** end points if text located */
	private float[] endPoints;

	/** flag to show if tease created on findText */
	private boolean includeTease;

	/** teasers for findtext */
	private String[] teasers;

	/** NOTE(review): presumably the teasers collected when searching for several terms at once - not used in this part of the file, confirm */
	private List multipleTermTeasers = new ArrayList();

	/** NOTE(review): presumably set while a multiple-term search is running - not used in this part of the file, confirm */
	private boolean usingMultipleTerms = false;

	/** set from the constructor argument; selects the XML-aware code paths when lines are joined and fragments are merged */
	private boolean isXMLExtraction = true;

	/*
	 * Variables to allow cross line search results
	 */
	/** Value placed between result areas to show they are part of the same result */
	private int linkedSearchAreas = -101;

	/**
	 * Creates a grouping instance working on the supplied raw data.
	 *
	 * @param pdf_data raw text data; also asked whether colour information was extracted
	 * @param pageData page data object, stored for later use
	 * @param isXMLExtraction true to use the XML-aware code paths when grouping
	 */
	public PdfGroupingAlgorithms(PdfData pdf_data, PdfPageData pageData, boolean isXMLExtraction) {
		this.isXMLExtraction = isXMLExtraction;
		this.pageData = pageData;
		this.pdf_data = pdf_data;

		// colour extraction is a property of the raw data
		this.colorExtracted = this.pdf_data.isColorExtracted();
	}

	/**
	 * Replaces the line separator held by this class (shared by all instances as it is static).
	 *
	 * @param sep separator to use in place of the "line.separator" system property value
	 */
	public static void setSeparator(String sep) {
		PdfGroupingAlgorithms.SystemSeparator = sep;
	}

	/**
	 * Chooses what is placed between two lines being joined: a space (the default), nothing, or a line break.
	 *
	 * @param rawLine1 first line, as stored
	 * @param rawLine2 second line, as stored
	 * @param isXMLExtraction true to strip XML from both lines before looking at them, false to only trim them
	 * @return the separator to put between the two lines
	 */
	static final private String getLineDownSeparator(StringBuilder rawLine1, StringBuilder rawLine2, boolean isXMLExtraction) {

		// characters treated as hyphens - the set is empty, so the hyphenation rule below never fires
		final String hyphenChars = "";

		// nothing sets this, so the underline suffix at the end is never added
		final boolean underlined = false;

		// cleaned copies of both lines so the characters at the join can be examined
		final StringBuilder first = isXMLExtraction ? Strip.stripXML(rawLine1, isXMLExtraction) : Strip.trim(rawLine1);
		final StringBuilder second = isXMLExtraction ? Strip.stripXML(rawLine2, isXMLExtraction) : Strip.trim(rawLine2);

		String result = " "; // space is default

		final int firstLen = first.length();
		final int secondLen = second.length();

		// the tests need two characters from each line
		if (firstLen > 1 && secondLen > 1) {

			final char lastOfFirst = first.charAt(firstLen - 1);
			final char penultimateOfFirst = first.charAt(firstLen - 2);
			final char headOfSecond = second.charAt(0);
			final char nextOfSecond = second.charAt(1);

			if (hyphenChars.indexOf(lastOfFirst) != -1) {
				// hyphenation: join with nothing unless it is ":-" or "space-"
				result = "";
				if (penultimateOfFirst == ':') result = "\n";
				if (lastOfFirst == ' ') result = " ";
			}
			else {
				// paragraph break if a full stop is followed by a capital letter or '&'
				final boolean endsWithStop = penultimateOfFirst == '.' || lastOfFirst == '.';
				final boolean startsNewSentence = Character.isUpperCase(headOfSecond) || headOfSecond == '&'
						|| Character.isUpperCase(nextOfSecond) || nextOfSecond == '&';

				// NOTE(review): XML and plain extraction receive the same break here - confirm no XML tag is missing
				if (endsWithStop && startsNewSentence) result = "\n";
			}
		}

		// add an underline if appropriate (same suffix for XML and plain extraction)
		if (underlined) result = result + "\n";

		return result;
	}

	/**
	 * Removes shadows (the same text printed twice in almost the same place) and merges drowned items (a fragment lying wholly inside another).
	 *
	 * @param avoidSpaces if true, drowned items are only merged when the chosen separator contains no space
	 */
	private final void cleanupShadowsAndDrownedObjects(boolean avoidSpaces) {

		final int[] live = getUnusedFragments();
		final int total = live.length;

		for (int outer = 0; outer < total; outer++) {

			final int base = live[outer];

			// may have been merged away while handling an earlier fragment
			if (this.isUsed[base]) continue;

			// centre of the base fragment
			float centreX = (this.f_x1[base] + this.f_x2[base]) / 2;
			float centreY = (this.f_y1[base] + this.f_y2[base]) / 2;

			for (int inner = outer + 1; inner < total; inner++) {

				final int other = live[inner];

				// either side may have been consumed by a previous pass of this loop
				if (this.isUsed[other] || this.isUsed[base]) continue;

				final boolean sameFontSize = this.fontSize[other] == this.fontSize[base];
				final float widthDiff = Math.abs((this.f_x2[other] - this.f_x1[other]) - (this.f_x2[base] - this.f_x1[base]));

				// shadow: same font size, similar width and the centre of base lies inside other
				final boolean isShadow = sameFontSize && centreX > this.f_x1[other] && centreX < this.f_x2[other] && widthDiff < 10
						&& centreY < this.f_y1[other] && centreY > this.f_y2[other];

				if (isShadow) {
					this.isUsed[other] = true;
					continue;
				}

				// drowned text: one fragment entirely inside the other
				final boolean otherInsideBase = this.f_x1[other] > this.f_x1[base] && this.f_x2[other] < this.f_x2[base]
						&& this.f_y1[other] < this.f_y1[base] && this.f_y2[other] > this.f_y2[base];
				final boolean baseInsideOther = this.f_x1[base] > this.f_x1[other] && this.f_x2[base] < this.f_x2[other]
						&& this.f_y1[base] < this.f_y1[other] && this.f_y2[base] > this.f_y2[other];

				if (!otherInsideBase && !baseInsideOther) continue;

				// the fragment with the larger y2 goes first, the one with the bottom y2 underneath
				final int firstItem, secondItem;
				if (this.f_y2[base] > this.f_y2[other]) {
					firstItem = base;
					secondItem = other;
				}
				else {
					firstItem = other;
					secondItem = base;
				}

				final String separator = getLineDownSeparator(this.content[firstItem], this.content[secondItem], this.isXMLExtraction);
				if (!avoidSpaces || separator.indexOf(' ') == -1) merge(firstItem, secondItem, separator, true);

				// the base co-ordinates may have grown in the merge
				centreX = (this.f_x1[base] + this.f_x2[base]) / 2;
				centreY = (this.f_y1[base] + this.f_y2[base]) / 2;
			}
		}
	}

	/**
	 * Decides whether the gap between two text fragments should be written as a space.
	 *
	 * @param c index of one fragment
	 * @param l index of the other fragment
	 * @param actualGap measured gap between the two fragments
	 * @param addMultiplespaceXMLTag whether gaps wider than one space should be flagged
	 * @param writingMode writing mode of the text (flagging only applies to PdfData.HORIZONTAL_LEFT_TO_RIGHT)
	 * @return " " if the gap is at least about one space wide, otherwise ""
	 */
	final private String isGapASpace(int c, int l, float actualGap, boolean addMultiplespaceXMLTag, int writingMode) {

		// size of a space for each fragment (space width times font size)
		final float spaceForC = this.spaceWidth[c] * this.fontSize[c];
		final float spaceForL = this.spaceWidth[l] * this.fontSize[l];

		// work with the smaller of the two
		final float smallerSpace = (spaceForC > spaceForL) ? spaceForL : spaceForC;

		// gap expressed as a number of spaces (space size is divided by 1000 first)
		float spaces = actualGap / (smallerSpace / 1000);

		// the int cast below rounds down, so lift anything just over half a space up to a full one
		if (spaces > 0.51f && spaces < 1) spaces = 1;

		final int spaceCount = (int) spaces;

		String sep = (spaceCount > 0) ? " " : "";

		// NOTE(review): meant to add an XML tag flagging multiple spaces, but the value assigned is the same single space - confirm the tag text is not missing
		if (spaceCount > 1 && addMultiplespaceXMLTag && writingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) sep = " ";

		return sep;
	}

	/**
	 * merge 2 text fragments together and update co-ordinates
	 */
	final private void merge(int m, int c, String separator, boolean moveFont) {

		// update co-ords
		if (this.f_x1[m] > this.f_x1[c]) this.f_x1[m] = this.f_x1[c];
		if (this.f_y1[m] < this.f_y1[c]) this.f_y1[m] = this.f_y1[c];
		if (this.f_x2[m] < this.f_x2[c]) this.f_x2[m] = this.f_x2[c];
		if (this.f_y2[m] > this.f_y2[c]) this.f_y2[m] = this.f_y2[c];

		if (this.isXMLExtraction) {
			String test = Fonts.fe;

			// add color tag if needed and changes
			if (this.colorExtracted) test = Fonts.fe + GenericColorSpace.ce;

			// move  if needed and add separator
			if ((moveFont) && (this.content[m].toString().lastIndexOf(test) != -1)) {
				String master = this.content[m].toString();
				this.content[m] = new StringBuilder(master.substring(0, master.lastIndexOf(test)));
				this.content[m].append(separator);
				this.content[m].append(master.substring(master.lastIndexOf(test)));
			}
			else {
				this.content[m].append(separator);
			}

			// Only map out space if text length is longer than 1
			if (this.textLength[c] > 1 && this.content[m].toString().endsWith(" ")) {
				this.content[m].deleteCharAt(this.content[m].lastIndexOf(" "));
			}
			// use font size of second text (ie at end of merged text)
			this.fontSize[m] = this.fontSize[c];

			// Remove excess / redundent xml tags
			if (this.content[c].indexOf("", this.content[m].lastIndexOf("") + 7 == this.content[m].lastIndexOf(">")) {
					this.content[c].replace(this.content[c].indexOf("") + 1, "");
					this.content[m].replace(this.content[m].lastIndexOf(""), this.content[m].lastIndexOf("") + 8, "");
				}
			}

			if (this.content[c].indexOf("", this.content[m].lastIndexOf("") + 6 == this.content[m].lastIndexOf(">")) {
					this.content[c].replace(this.content[c].indexOf("") + 1, "");
					this.content[m].replace(this.content[m].lastIndexOf(""), this.content[m].lastIndexOf("") + 7, "");
				}
			}

			this.content[m] = this.content[m].append(this.content[c]);

			// track length of text less all tokens
			this.textLength[m] = this.textLength[m] + this.textLength[c];

			// set objects to null to flush and log as used
			this.isUsed[c] = true;
			this.content[c] = null;
		}
		else {

			// use font size of second text (ie at end of merged text)
			this.fontSize[m] = this.fontSize[c];

			// add together
			this.content[m] = this.content[m].append(separator).append(this.content[c]);

			// track length of text less all tokens
			this.textLength[m] = this.textLength[m] + this.textLength[c];

			// set objects to null to flush and log as used
			this.isUsed[c] = true;
			this.content[c] = null;
		}
	}

	/**
	 * Strips the hidden position/width markers from the content of every fragment still in use.
	 */
	final private void removeEncoding() {

		final int[] live = getUnusedFragments();

		for (int idx = 0; idx < live.length; idx++) {

			final int fragment = live[idx];

			// skip anything flagged as used, otherwise swap in the marker-free text
			if (!this.isUsed[fragment]) this.content[fragment] = removeHiddenMarkers(fragment);
		}
	}

	/**
	 * Copies every raw text element from the page data object into freshly allocated local arrays (one slot per element) so they can be merged
	 * quickly. Also refreshes the flag showing whether colour information was extracted.
	 */
	final private void copyToArrays() {

		final PdfData source = this.pdf_data;

		this.colorExtracted = source.isColorExtracted();

		final int total = source.getRawTextElementCount();

		// co-ordinates
		this.f_x1 = new float[total];
		this.f_x2 = new float[total];
		this.f_y1 = new float[total];
		this.f_y2 = new float[total];

		// text and its properties
		this.content = new StringBuilder[total];
		this.textLength = new int[total];
		this.fontSize = new int[total];
		this.spaceWidth = new float[total];
		this.f_colorTag = new String[total];
		this.writingMode = new int[total];
		this.moveType = new int[total];

		// nothing has been merged yet, so all flags start as false
		this.isUsed = new boolean[total];

		for (int slot = 0; slot < total; slot++) {

			this.f_x1[slot] = source.f_x1[slot];
			this.f_x2[slot] = source.f_x2[slot];
			this.f_y1[slot] = source.f_y1[slot];
			this.f_y2[slot] = source.f_y2[slot];

			this.content[slot] = new StringBuilder(source.contents[slot]);
			this.textLength[slot] = source.text_length[slot];
			this.fontSize[slot] = source.f_end_font_size[slot];
			this.spaceWidth[slot] = source.space_width[slot];
			this.f_colorTag[slot] = source.colorTag[slot];
			this.writingMode[slot] = source.f_writingMode[slot];
			this.moveType[slot] = source.move_command[slot];
		}
	}

	/**
	 * Lists the indices of all fragments not yet flagged as used, in ascending order.
	 *
	 * @return array holding exactly the indices whose isUsed flag is false
	 */
	private int[] getUnusedFragments() {

		final int total = this.isUsed.length;

		// first pass: count so the result can be sized exactly
		int live = 0;
		for (int idx = 0; idx < total; idx++) {
			if (!this.isUsed[idx]) live++;
		}

		// second pass: record the pointers
		final int[] items = new int[live];
		int slot = 0;
		for (int idx = 0; idx < total; idx++) {
			if (!this.isUsed[idx]) items[slot++] = idx;
		}

		return items;
	}

	/**
	 * Strips the hidden position and width values encoded into the content of one fragment.
	 *
	 * The work is handed to the static String version so that both share the same handling of records which have no text after the third marker
	 * (previously this version treated the following marker as text and lost track of the record layout).
	 *
	 * @param c index of the fragment to clean
	 * @return the fragment's own content object if it holds no markers, otherwise a new StringBuilder with the markers removed
	 */
	private StringBuilder removeHiddenMarkers(int c) {

		final StringBuilder raw = this.content[c];

		// make sure has markers and ignore if not
		if (raw.indexOf(MARKER) == -1) return raw;

		return new StringBuilder(removeHiddenMarkers(raw.toString()));
	}

	/**
	 * Sets whether HTML tags are included in teasers (ie whether the word in a teaser keeps its markup or is returned as plain text).
	 *
	 * @param value true to include the HTML tags, false to leave them out
	 */
	public void setIncludeHTML(boolean value) {
		includeHTMLtags = value;
	}

	/**
	 * Returns the text with the hidden encoding removed. Each encoded item is read as: marker, start position, marker, width, marker, text - only
	 * the text is kept.
	 *
	 * An item with no text before the next marker is skipped, and an item that is cut short or has no text at the very end of the data ends the
	 * scan instead of throwing NoSuchElementException.
	 *
	 * @param contents text which may contain markers, may be null
	 * @return null for null input, the same String if it holds no markers, otherwise the text without the encoded values
	 */
	public static String removeHiddenMarkers(String contents) {

		// trap null
		if (contents == null) return null;

		// make sure has markers and ignore if not
		if (!contents.contains(MARKER)) return contents;

		final StringTokenizer tokens = new StringTokenizer(contents, MARKER, true);
		final StringBuilder plain = new StringBuilder();

		// true when the marker which opens the next item was already read while looking for text
		boolean markerPending = false;
		String token = null;

		while (tokens.hasMoreTokens()) {

			// reuse the marker read in the last round (token still holds it), otherwise fetch the next token
			if (markerPending) markerPending = false;
			else token = tokens.nextToken();

			// anything outside an encoded item is kept as it is
			if (!MARKER.equals(token)) {
				plain.append(token);
				continue;
			}

			// skip start position, second marker, width and third marker
			for (int skipped = 0; skipped < 4 && tokens.hasMoreTokens(); skipped++) {
				tokens.nextToken();
			}

			// item is incomplete or has no text and nothing follows it
			if (!tokens.hasMoreTokens()) break;

			// Lonzak: There are PDFs which contain \0\0 (should be e.g. \0 \0 or \0c\0...) - the item then has no text and what
			// was read is really the first marker of the next item, so hold it over for the next round
			final String text = tokens.nextToken();
			if (MARKER.equals(text)) markerPending = true;
			else plain.append(text);
		}

		return plain.toString();
	}

	/**
	 * Tries to find vertical lines in the data (not as efficient as it could be). For every fragment inside the area, the x position encoded for
	 * the first character, and for each character following text containing a space, is counted, together with the point midway between it and
	 * the previous encoded position. Positions counted more often than half of the highest count are added to lineBreaks.
	 *
	 * NOTE(review): co-ordinates are read from the local arrays while the text comes from pdf_data.contents - confirm both are indexed alike at
	 * the point this is called.
	 *
	 * @param minX left edge of the area
	 * @param minY bottom edge of the area
	 * @param maxX right edge of the area
	 * @param maxY top edge of the area
	 * @param currentWritingMode one of the PdfData writing mode constants, decides how co-ordinates are mapped
	 * @throws PdfException if currentWritingMode is not one of the four supported values (only detected if there is at least one text element)
	 */
	private void findVerticalLines(float minX, float minY, float maxX, float maxY, int currentWritingMode) throws PdfException {

		// hold counters on all x values
		final Map<Integer, Integer> xLines = new HashMap<Integer, Integer>();

		// count for the most popular item
		int mostFrequent = 0;

		final int count = this.pdf_data.getRawTextElementCount();

		for (int i = 0; i < count; i++) {

			final float x1, x2, y1, y2;

			// set pointers so text reads as left to right
			if (currentWritingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) {
				x1 = this.f_x1[i];
				x2 = this.f_x2[i];
				y1 = this.f_y1[i];
				y2 = this.f_y2[i];
			}
			else if (currentWritingMode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
				x2 = this.f_x1[i];
				x1 = this.f_x2[i];
				y1 = this.f_y1[i];
				y2 = this.f_y2[i];
			}
			else if (currentWritingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
				x1 = this.f_y1[i];
				x2 = this.f_y2[i];
				y1 = this.f_x2[i];
				y2 = this.f_x1[i];
			}
			else if (currentWritingMode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
				x1 = this.f_y2[i];
				x2 = this.f_y1[i];
				y2 = this.f_x1[i];
				y1 = this.f_x2[i];
			}
			else {
				throw new PdfException("Illegal value " + currentWritingMode + " for currentWritingMode");
			}

			// only process if in the area (with half a unit of tolerance)
			final boolean inArea = (x1 > minX - .5) && (x2 < maxX + .5) && (y2 > minY - .5) && (y1 < maxY + .5);
			if (!inArea) continue;

			// run though the string extracting our markers to get x values
			final StringTokenizer tokens = new StringTokenizer(this.pdf_data.contents[i], MARKER, true);
			float currentX = 0;
			String lastValue = "";

			while (tokens.hasMoreTokens()) {

				// anything before the first marker of an item is ignored
				if (!tokens.nextToken().equals(MARKER)) continue;

				final String position = tokens.nextToken(); // point character starts

				if (position.length() > 0) {
					try {
						final float parsed = Float.parseFloat(position);
						final float lastX = currentX;
						currentX = parsed;

						// count x at start or after a space, plus the point in the middle of the gap
						if (lastValue.length() == 0 || lastValue.indexOf(' ') != -1) {

							mostFrequent = Math.max(mostFrequent, bumpXCounter(xLines, (int) currentX));

							if (lastX != 0) {
								final int middle = (int) (lastX + ((currentX - lastX) / 2));
								mostFrequent = Math.max(mostFrequent, bumpXCounter(xLines, middle));
							}
						}
					}
					catch (NumberFormatException e) {
						// a position which is not a number is logged and not counted
						LogWriter.writeLog("Exception " + e + " stripping x values");
					}
				}

				tokens.nextToken(); // second marker
				tokens.nextToken(); // glyph width
				tokens.nextToken(); // third marker
				lastValue = tokens.nextToken(); // the text itself
			}
		}

		// now analyse the data - keep positions seen more than half as often as the most popular
		final int minimumNeeded = mostFrequent / 2;

		for (Map.Entry<Integer, Integer> entry : xLines.entrySet()) {
			if (entry.getValue() > minimumNeeded) this.lineBreaks.addElement(entry.getKey());
		}
	}

	/**
	 * Adds one to the counter held for an x position, creating it if needed.
	 *
	 * @param counters map of x position to number of times seen
	 * @param x position to count
	 * @return the new total for x
	 */
	private static int bumpXCounter(Map<Integer, Integer> counters, int x) {
		final Integer seen = counters.get(x);
		final int total = (seen == null) ? 1 : seen + 1;
		counters.put(x, total);
		return total;
	}

	/**
	 * Method splitFragments adds raw fragments to processed fragments, breaking up any with vertical lines through them or with what looks like tabbed spaces
	 * 
	 * @throws PdfException
	 */
	private void copyToArrays(float minX, float minY, float maxX, float maxY, boolean keepFont, boolean breakOnSpace, boolean findLines,
			String punctuation, boolean isWordlist) throws PdfException {

		final boolean debugSplit = false;

		// initialise local arrays allow for extra space
		int count = this.pdf_data.getRawTextElementCount() + increment;

		this.f_x1 = new float[count];
		this.f_colorTag = new String[count];
		this.hadSpace = new boolean[count];
		this.f_x2 = new float[count];
		this.f_y1 = new float[count];
		this.f_y2 = new float[count];

		this.spaceWidth = new float[count];
		this.content = new StringBuilder[count];
		this.fontSize = new int[count];
		this.textLength = new int[count];
		this.writingMode = new int[count];
		this.isUsed = new boolean[count];
		this.moveType = new int[count];

		// flag to find lines based on orientation of first text item*/
		boolean linesScanned = false;

		// set defaults and calculate dynamic values
		int text_length;
		count = count - increment;
		float last_pt, min, max, pt, x1, x2, y1, y2, linePos, character_spacing;
		String raw, char_width = "", currentColor;
		StringBuilder text = new StringBuilder();

		// work through fragments
		for (int i = 0; i < count; i++) {

			// extract values
			character_spacing = this.pdf_data.f_character_spacing[i];
			raw = this.pdf_data.contents[i];
			x1 = this.pdf_data.f_x1[i];
			currentColor = this.pdf_data.colorTag[i];
			x2 = this.pdf_data.f_x2[i];
			y1 = this.pdf_data.f_y1[i];
			y2 = this.pdf_data.f_y2[i];
			text_length = this.pdf_data.text_length[i];
			int mode = this.pdf_data.f_writingMode[i];
			int moveType = this.pdf_data.move_command[i];

			/**
			 * see if in area
			 */
			boolean accepted = false;

			if (debugSplit) {
				System.out.println("raw data=" + raw);
				System.out.println("text data=" + PdfGroupingAlgorithms.removeHiddenMarkers(raw));
			}

			// if at least partly in the area, process
			if ((mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) && y2 > minY && y1 < maxY && x1 < maxX
					&& x2 > minX) {
				accepted = true;
			}
			else
				if ((mode == PdfData.VERTICAL_BOTTOM_TO_TOP || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) && x1 > minX && x2 < maxX && y1 > minY
						&& y2 < maxY) accepted = true;

			if (accepted) {

				/** find lines */
				// look for possible vertical or horizontal lines in the data
				if ((!linesScanned) && (findLines)) {
					findVerticalLines(minX, minY, maxX, maxY, mode);
					linesScanned = true;
				}

				/**
				 * initialise pointers and work out an 'average character space'
				 **/
				if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
					// space = (x2 - x1) / text_length;
					pt = x1;
					last_pt = x1;
					min = minX;
					max = maxX;
				}
				else { // vertical text
						// space = (y1 - y2) / text_length;
					pt = y2;
					last_pt = y2;
					min = minY;
					max = maxY;
				}

				linePos = -1;

				/**
				 * work through text, using embedded markers to work out whether each letter is IN or OUT
				 */
				char[] line = raw.toCharArray();

				int end = line.length;
				int pointer = 0;

				String value, textValue = "", pt_reached;

				// allow for no tokens and return all text fragment
				if (!raw.contains(MARKER)) text = new StringBuilder(raw);

				boolean isFirstValue = true, breakPointset = false;

				/**
				 * work through text, using embedded markers to work out whether each letter is IN or OUT
				 */
				while (pointer < end) {

					// only data between min and y locations
					while (true) {

						/**
						 * read value
						 */

						if (line[pointer] != MARKER2) {
							// find second marker and get width
							int startPointer = pointer;
							while ((pointer < end) && (line[pointer] != MARKER2))
								pointer++;
							value = raw.substring(startPointer, pointer);

						}
						else {// if (value.equals(MARKER)) { // read the next token and its location and width

							// find first marker
							while ((pointer < end) && (line[pointer] != MARKER2))
								pointer++;

							pointer++;

							// find second marker and get width
							int startPointer = pointer;
							while ((pointer < end) && (line[pointer] != MARKER2))
								pointer++;
							pt_reached = raw.substring(startPointer, pointer);
							pointer++;

							// find third marker
							startPointer = pointer;
							while ((pointer < end) && (line[pointer] != MARKER2))
								pointer++;

							char_width = raw.substring(startPointer, pointer);
							pointer++;

							// find next marker
							startPointer = pointer;
							while ((pointer < end) && (line[pointer] != MARKER2))
								pointer++;

							value = raw.substring(startPointer, pointer);

							textValue = value; // keep value with no spaces

							if (pt_reached.length() > 0) { // set point character starts
								last_pt = pt;
								pt = Float.parseFloat(pt_reached);

								if (breakPointset) {
									if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) x1 = pt;
									else
										if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) x2 = pt;
										else
											if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) y2 = pt;
											else
												if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) y1 = pt;
									breakPointset = false;
								}
							}

							// add font start if needed
							if ((this.isXMLExtraction) && (last_pt < min) && (pt > min) && (!value.startsWith(Fonts.fb))) value = Fonts
									.getActiveFontTag(raw, "") + value;

						}

						if ((pt > min) & (pt < max)) {
							if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) if ((x1 < min || x1 > max) && pt >= min) x1 = pt;
							else
								if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) if ((x2 > max || x2 < min) && pt <= max) x2 = pt;
								else
									if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) if ((y2 < min || y2 > max) && pt >= min) y2 = pt;
									else
										if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) if ((y1 < min || y1 > max) && pt <= min) y1 = pt;
							break;
						}

						value = "";
						textValue = "";

						if (pointer >= end) break;
					}

					/** make sure font not sliced off on first value */
					if ((isFirstValue)) {

						isFirstValue = false;
						if ((this.isXMLExtraction) && (keepFont) && (!value.startsWith(Fonts.fb)) && (!value.startsWith(GenericColorSpace.cb))) // &&(!text.toString().startsWith(Fonts.fb))))
						text.append(Fonts.getActiveFontTag(text.toString(), raw));
					}

					/**
					 * we now have a valid value inside the selected area so perform tests
					 */
					// see if a break occurs
					boolean is_broken = false;
					if (findLines && character_spacing > 0 && text.toString().endsWith(" ")) {
						int counts = this.lineBreaks.size();
						for (int jj = 0; jj < counts; jj++) {
							int test_x = this.lineBreaks.elementAt(jj);
							if ((last_pt < test_x) & (pt > test_x)) {
								jj = counts;
								is_broken = true;
							}
						}
					}

					boolean endsWithPunctuation = checkForPunctuation(textValue, punctuation);

					if (is_broken) { // break on double-spaces or larger

						if (debugSplit) System.out.println("Break 1 is_broken");

						float Nx1 = x1, Nx2 = x2, Ny1 = y1, Ny2 = y2;
						if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) Nx2 = last_pt + Float.parseFloat(char_width);
						else
							if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) Nx1 = last_pt + Float.parseFloat(char_width);
							else
								if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) Ny1 = last_pt + Float.parseFloat(char_width);
								else
									if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) Ny2 = last_pt + Float.parseFloat(char_width);

						addFragment(moveType, i, text, Nx1, Nx2, Ny1, Ny2, text_length, keepFont, currentColor, isWordlist);
						text = new StringBuilder(Fonts.getActiveFontTag(text.toString(), raw));
						text.append(value);

						if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) x1 = pt;
						else
							if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) x2 = pt;
							else
								if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) y2 = pt;
								else
									if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) y1 = pt;

					}
					else
						if ((endsWithPunctuation) | ((breakOnSpace) && ((textValue.indexOf(' ') != -1) || (value.endsWith(" "))))
								| ((textValue.contains("   ")))) {// break on double-spaces or larger
							if (debugSplit) System.out.println("Break 2 endsWithPunctuation=" + endsWithPunctuation + " textValue=" + textValue + '<'
									+ " value=" + value + '<' + " text=" + text + '<');

							// Remove final bit of the below if to fix issue in case 11542
							if (textValue.length() > 1 && textValue.indexOf(' ') != -1) {// && x1==pt){ //add in space values to start of next shape
								// count the spaces
								int ptr = textValue.indexOf(' ');

								if (ptr > 0) {
									pt = pt + ptr * (Float.parseFloat(char_width) / textValue.length());
								}
								// else
								// pt=pt+Float.parseFloat(char_width);

							}

							if (!endsWithPunctuation) text.append(value.trim());

							if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) {

								if (debugSplit) System.out.println("Add " + x1 + ' ' + pt + " text=" + text + " i=" + i);
								addFragment(moveType, i, text, x1, pt, y1, y2, text_length, keepFont, currentColor, isWordlist);
							}
							else
								if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
									if (debugSplit) System.out.println("b");
									addFragment(moveType, i, text, pt, x2, y1, y2, text_length, keepFont, currentColor, isWordlist);
								}
								else
									if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
										if (debugSplit) System.out.println("c");
										addFragment(moveType, i, text, x1, x2, pt, y2, text_length, keepFont, currentColor, isWordlist);
									}
									else
										if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
											if (debugSplit) System.out.println("d");
											addFragment(moveType, i, text, x1, x2, y1, pt, text_length, keepFont, currentColor, isWordlist);
										}

							if (char_width.length() > 0) { // add in space values to start of next shape
								// count the spaces
								int ptr = 0;

								if (textValue.indexOf(' ') != -1) ptr = textValue.indexOf(' ');

								if (isWordlist) {
									int len = textValue.length();
									while (ptr < len && textValue.charAt(ptr) == ' ') {
										ptr++;
									}
								}

								if (ptr > 0) pt = pt + ptr * Float.parseFloat(char_width);
								else pt = pt + Float.parseFloat(char_width);

								if (ptr > 0) breakPointset = true;
								else breakPointset = false;

							}

							// store fact it had a space in case we generate wordlist
							if ((breakOnSpace) & (this.nextSlot > 0)) this.hadSpace[this.nextSlot - 1] = true;

							text = new StringBuilder(Fonts.getActiveFontTag(text.toString(), raw));
							if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) x1 = pt;// + space;
							else
								if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) x2 = pt;// - space;
								else
									if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) y2 = pt;// + space;
									else
										if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) y1 = pt;// - space;

						}
						else
							if ((linePos != -1) & (pt > linePos)) {// break on a vertical line

								if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) addFragment(moveType, i, text, x1, linePos, y1, y2, text_length,
										keepFont, currentColor, isWordlist);
								else
									if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) addFragment(moveType, i, text, linePos, x2, y1, y2, text_length,
											keepFont, currentColor, isWordlist);
									else
										if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) addFragment(moveType, i, text, x1, x2, linePos, y2, text_length,
												keepFont, currentColor, isWordlist);
										else
											if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) addFragment(moveType, i, text, x1, x2, y1, linePos,
													text_length, keepFont, currentColor, isWordlist);

								text = new StringBuilder(Fonts.getActiveFontTag(text.toString(), raw));
								text.append(value);

								if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) x1 = linePos;
								else
									if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) x2 = linePos;
									else
										if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) y2 = linePos;
										else
											if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) y1 = linePos;

								linePos = -1;

							}
							else { // allow for space used as tab
								if ((this.isXMLExtraction) && (value.endsWith(' ' + Fonts.fe))) {
									value = Fonts.fe;
									textValue = "";

									if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) x2 = last_pt;
									else
										if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) x1 = last_pt;
										else
											if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) y1 = last_pt;
											else
												if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) y2 = last_pt;
								}
								text.append(value);
							}

				}

				// trap scenario we found if all goes through with no break at end
				if ((keepFont) && (this.isXMLExtraction) && (!text.toString().endsWith(Fonts.fe))
						&& (!text.toString().endsWith(GenericColorSpace.ce))) text.append(Fonts.fe);

				// create new line with what is left and output
				if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
					if (x1 < x2) addFragment(moveType, i, text, x1, x2, y1, y2, text_length, keepFont, currentColor, isWordlist);
				}
				else
					if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
						if (y1 > y2) addFragment(moveType, i, text, x1, x2, y1, y2, text_length, keepFont, currentColor, isWordlist);
					}
				text = new StringBuilder();

			}
		}

		// local lists for faster access
		this.isUsed = new boolean[this.nextSlot];
	}

	/**
	 * Reports whether a text fragment ends with one of the caller-supplied punctuation characters.
	 * 
	 * Trailing spaces and anything enclosed in a trailing {@code <...>} tag are skipped before the last character is examined. A trailing ';'
	 * that closes something shaped like an entity ({@code &xxx;} or {@code &#nn;}) is never treated as punctuation.
	 * 
	 * @param textValue
	 *            fragment text to examine (may contain tags); null or empty yields false
	 * @param punctuation
	 *            every character that counts as punctuation; null disables the check
	 * @return true if the last significant character of textValue is contained in punctuation
	 */
	private static boolean checkForPunctuation(String textValue, String punctuation) {

		if (punctuation == null || textValue == null || textValue.length() == 0) return false;

		// walk back from the end over spaces and over the contents of any <...> tag
		int pos = textValue.length() - 1;
		char testChar = textValue.charAt(pos);
		boolean inTag = (testChar == '>');
		while ((inTag || testChar == ' ') && pos > 0) {

			if (testChar == '<') inTag = false;

			pos--;
			testChar = textValue.charAt(pos);

			if (testChar == '>') inTag = true;
		}

		// only characters the caller listed count - this now includes ';', which was previously always accepted
		if (punctuation.indexOf(testChar) == -1) return false;

		if (testChar != ';') return true;

		// a ';' may just be the end of an entity: scan back over letters/digits looking for its '&' or '#'
		for (int i = pos - 1; i > -1; i--) {
			char previous = textValue.charAt(i);
			if (previous == '&' || previous == '#') return false;
			if (!Character.isLetterOrDigit(previous)) break;
		}
		return true;
	}

	/**
	 * Records one text fragment, with its bounds and per-fragment metadata, in the next free slot of the fragment arrays.
	 * 
	 * For word lists the text is first stripped of ampersand-hash sequences and of angle brackets. Fragments for which getFirstChar returns -1 are
	 * dropped. When font tokens are not kept the text is passed through Strip.stripXML; otherwise, in XML extraction mode, missing closing font
	 * and colour tokens are appended. The arrays are enlarged by <code>increment</code> entries when no slot is free. The hadSpace flag of the new
	 * slot is not written here.
	 * 
	 * @param moveType
	 *            stored in this.moveType for the new slot
	 * @param index
	 *            index into the pdf_data arrays from which font size, writing mode and space width are read
	 * @param contentss
	 *            fragment text; the closing tokens are appended to this same builder when they are added
	 * @param x1
	 *            stored in f_x1
	 * @param x2
	 *            stored in f_x2
	 * @param y1
	 *            stored in f_y1
	 * @param y2
	 *            stored in f_y2
	 * @param text_len
	 *            stored in textLength
	 * @param keepFontTokens
	 *            false to strip XML/font tokens from the text
	 * @param currentColorTag
	 *            stored in f_colorTag
	 * @param isWordlist
	 *            true to strip ampersand-hash sequences and angle brackets first
	 */
	private void addFragment(int moveType, int index, StringBuilder contentss, float x1, float x2, float y1, float y2, int text_len,
			boolean keepFontTokens, String currentColorTag, boolean isWordlist) {

		StringBuilder fragment = contentss;
		final String snapshot = fragment.toString(); // tests below look at the text as it was passed in

		// strip <> or ascii equivalents
		if (isWordlist) {
			if (snapshot.contains("&#")) fragment = Strip.stripAmpHash(fragment);

			final boolean hasArrows = snapshot.indexOf('<') != -1 || snapshot.indexOf('>') != -1;
			if (hasArrows) {
				if (this.isXMLExtraction) fragment = Strip.stripXMLArrows(fragment, true);
				else fragment = Strip.stripArrows(fragment);
			}
		}

		// ignore blank space objects
		if (getFirstChar(fragment) == -1) return;

		if (!keepFontTokens) {
			// strip fonts if required
			fragment = Strip.stripXML(fragment, this.isXMLExtraction);
		}
		else
			if (this.isXMLExtraction) {
				if (this.pdf_data.isColorExtracted() && !fragment.toString().endsWith(GenericColorSpace.ce)) {
					// close the font token first (if still open), then the colour token
					if (!fragment.toString().endsWith(Fonts.fe)) fragment = fragment.append(Fonts.fe);
					fragment = fragment.append(GenericColorSpace.ce);
				}
				else
					if (!this.pdf_data.isColorExtracted() && !fragment.toString().endsWith(Fonts.fe)) {
						fragment = fragment.append(Fonts.fe);
					}
			}

		// make room when every slot is taken
		final int oldSize = this.f_x1.length;
		if (this.nextSlot >= oldSize) {
			final int newSize = oldSize + increment;

			float[] grownX1 = new float[newSize];
			String[] grownColorTag = new String[newSize];
			float[] grownX2 = new float[newSize];
			float[] grownY1 = new float[newSize];
			float[] grownY2 = new float[newSize];
			float[] grownSpaceWidth = new float[newSize];
			StringBuilder[] grownContent = new StringBuilder[newSize];
			int[] grownFontSize = new int[newSize];
			int[] grownTextLength = new int[newSize];
			int[] grownWritingMode = new int[newSize];
			int[] grownMoveType = new int[newSize];
			boolean[] grownIsUsed = new boolean[newSize];
			boolean[] grownHadSpace = new boolean[newSize];

			// copy in existing
			for (int k = 0; k < oldSize; k++) {
				grownX1[k] = this.f_x1[k];
				grownColorTag[k] = this.f_colorTag[k];
				grownX2[k] = this.f_x2[k];
				grownY1[k] = this.f_y1[k];
				grownY2[k] = this.f_y2[k];
				grownHadSpace[k] = this.hadSpace[k];
				grownSpaceWidth[k] = this.spaceWidth[k];
				grownContent[k] = this.content[k];
				grownFontSize[k] = this.fontSize[k];
				grownWritingMode[k] = this.writingMode[k];
				grownTextLength[k] = this.textLength[k];
				grownIsUsed[k] = this.isUsed[k];
				grownMoveType[k] = this.moveType[k];
			}

			this.f_x1 = grownX1;
			this.f_colorTag = grownColorTag;
			this.hadSpace = grownHadSpace;
			this.f_x2 = grownX2;
			this.f_y1 = grownY1;
			this.f_y2 = grownY2;
			this.isUsed = grownIsUsed;
			this.fontSize = grownFontSize;
			this.writingMode = grownWritingMode;
			this.textLength = grownTextLength;
			this.spaceWidth = grownSpaceWidth;
			this.content = grownContent;
			this.moveType = grownMoveType;
		}

		// fill the slot and move on
		final int target = this.nextSlot;

		this.f_x1[target] = x1;
		this.f_colorTag[target] = currentColorTag;
		this.f_x2[target] = x2;
		this.f_y1[target] = y1;
		this.f_y2[target] = y2;
		this.moveType[target] = moveType;

		this.fontSize[target] = this.pdf_data.f_end_font_size[index];
		this.writingMode[target] = this.pdf_data.f_writingMode[index];
		this.textLength[target] = text_len;

		this.spaceWidth[target] = this.pdf_data.space_width[index];
		this.content[target] = fragment;

		this.nextSlot++;
	}

	// ////////////////////////////////////////////////////////////////////
	/**
	 * Folds every table row into a single master fragment, visiting rows in line_order, and for XHTML output adds the leading and trailing
	 * marker text around the result.
	 * 
	 * @param border_width
	 *            selects how the leading marker is attached for XHTML output (0 inserts into the existing builder, any other value builds a new
	 *            one)
	 */
	private void mergeTableRows(int border_width) {

		// row separator per output format; the two literals are currently the same text and are kept separate so the formats can differ
		String rowSeparator = "\n";
		if (!this.isXHTML) rowSeparator = "\n";

		// the first row in display order starts out as the master
		this.master = ((Vector_Int) this.lines.elementAt(this.line_order[0])).elementAt(0);

		for (int row = 1; row < this.max_rows; row++) {

			final int rowHead = ((Vector_Int) this.lines.elementAt(this.line_order[row])).elementAt(0);

			// an empty master hands over to this row instead of merging
			if (this.content[this.master] == null) {
				this.master = rowHead;
				continue;
			}

			if (this.content[rowHead] != null) merge(this.master, rowHead, rowSeparator, false);
		}

		// start/end markers are only added for XHTML
		if (!this.isXHTML) return;

		if (border_width == 0) {
			this.content[this.master].insert(0, "\n");
			this.content[this.master].append("\n\n");
		}
		else {
			StringBuilder wrapped = new StringBuilder("\n");
			wrapped.append(this.content[this.master]);
			this.content[this.master] = wrapped;
			wrapped.append("\n\n");
		}
	}

	// ////////////////////////////////////////////////
	/**
	 * Collects the indices of all fragments not flagged in isUsed and returns them ordered by Sorts.quicksort.
	 * 
	 * Coordinates are truncated to int before being used as sort keys.
	 * 
	 * @param sortOnX
	 *            true to use x1 as the first key and y1 as the second; false to use a y value first and x1 second
	 * @param use_y1
	 *            when sortOnX is false, true selects y1 as the first key and false selects y2
	 * @return the unused fragment indices in the order produced by Sorts.quicksort
	 */
	final private int[] getsortedUnusedFragments(boolean sortOnX, boolean use_y1) {
		final int fragmentCount = this.isUsed.length;

		// first pass: remember which slots are still unused
		int[] candidates = new int[fragmentCount];
		int unusedCount = 0;
		for (int slot = 0; slot < fragmentCount; slot++) {
			if (!this.isUsed[slot]) candidates[unusedCount++] = slot;
		}

		// second pass: exact-size arrays holding the indices and their sort keys
		int[] unusedSlots = new int[unusedCount];
		int[] keyX1 = new int[unusedCount];
		int[] keyY1 = new int[unusedCount];
		int[] keyY2 = new int[unusedCount];

		for (int n = 0; n < unusedCount; n++) {
			final int slot = candidates[n];
			unusedSlots[n] = slot;
			keyX1[n] = (int) this.f_x1[slot];
			keyY1[n] = (int) this.f_y1[slot];
			keyY2[n] = (int) this.f_y2[slot];
		}

		if (sortOnX) return Sorts.quicksort(keyX1, keyY1, unusedSlots);

		if (use_y1) return Sorts.quicksort(keyY1, keyX1, unusedSlots);

		return Sorts.quicksort(keyY2, keyX1, unusedSlots);
	}

	// ////////////////////////////////////////////////////////////////////
	/**
	 * Create rows of data from the preassembled line indices in this.lines, adding separators. Columns are discovered one at a time by scanning
	 * the next unplaced item of every row; each row is built into a temporary list (rowContents, -1 marking an empty cell) and only then turned
	 * into text, because the number of columns is not known until the whole table has been scanned. The assembled text of each row replaces the
	 * content of that row's first fragment.
	 * 
	 * @param keep_alignment_information
	 *            presumably controls whether alignment is written into the cell markup; the statement that would read it appears damaged in
	 *            this copy of the file - TODO confirm against the upstream source
	 * @param keep_width_information
	 *            true to append a width attribute to each XHTML cell
	 * @param currentWritingMode
	 *            one of the PdfData writing-mode constants; selects which coordinate arrays act as the horizontal axis
	 * @throws PdfException
	 *             if currentWritingMode is not one of the four handled writing modes
	 */
	private void createTableRows(boolean keep_alignment_information, boolean keep_width_information, int currentWritingMode) throws PdfException {

		/**
		 * local references to the coordinate arrays (aliases of the instance arrays, not copies)
		 */
		float[] f_x1, f_x2;

		/**
		 * set pointers so left to right text
		 */
		if (currentWritingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) {
			f_x1 = this.f_x1;
			f_x2 = this.f_x2;
			// f_y1=this.f_y1;
			// f_y2=this.f_y2;
		}
		else
			if (currentWritingMode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
				f_x2 = this.f_x1;
				f_x1 = this.f_x2;
				// f_y1=this.f_y1;
				// f_y2=this.f_y2;
			}
			else
				if (currentWritingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
					f_x1 = this.f_y2;
					f_x2 = this.f_y1;
					// f_y1=this.f_x2;
					// f_y2=this.f_x1;
				}
				else
					if (currentWritingMode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
						f_x1 = this.f_y1;
						f_x2 = this.f_y2;
						// f_y2=this.f_x1;
						// f_y1=this.f_x2;

						/**
						 * fiddle x,y co-ords so it works
						 */

						// get max size
						int maxX = 0;
						for (float aF_x1 : f_x1) {
							if (maxX < aF_x1) maxX = (int) aF_x1;
						}

						maxX++; // allow for fp error
						// turn around - note f_x1/f_x2 alias this.f_y1/this.f_y2, so the instance arrays themselves are rewritten here
						for (int ii = 0; ii < f_x2.length; ii++) {
							f_x1[ii] = maxX - f_x1[ii];
							f_x2[ii] = maxX - f_x2[ii];
						}

					}
					else {
						throw new PdfException("Illegal value " + currentWritingMode + "for currentWritingMode");
					}

		int item, i;// , current_col = -1;

		int itemsInTable = 0, items_added = 0;
		// pointer to current element on each row
		int[] currentItem = new int[this.max_rows];

		// per-row list of fragment ids, one entry per column (-1 = empty cell)
		Vector_Int[] rowContents = new Vector_Int[this.max_rows];
		Vector_String alignments = new Vector_String(); // text alignment
		Vector_Float widths = new Vector_Float(); // cell widths
		Vector_Float cell_x1 = new Vector_Float(); // left edge of each column
		String separator = "", empty_cell = " ";

		if (this.isXHTML == false) {
			separator = "\",\"";
			empty_cell = "";
		}

		/**
		 * set number of items on each line, column count and populate empty rows
		 */
		int[] itemCount = new int[this.max_rows];
		for (i = 0; i < this.max_rows; i++) {
			// stored as size() - 1, so it is compared against currentItem as a count but also used as a last index below
			itemCount[i] = ((Vector_Int) this.lines.elementAt(i)).size() - 1;

			// total number of items
			itemsInTable = itemsInTable + itemCount[i];

			// reset other values
			currentItem[i] = 0;
			rowContents[i] = new Vector_Int(20);
		}

		// now work through and split any overlapping items until all done - each pass of this loop establishes one column
		while (true) {

			// size of column and pointers (9999 is used as the 'not yet set' value for the minimum searches)
			float x1 = 9999, min_x2 = 9999, x2, current_x1, current_x2, c_x1, next_x1 = 9999, c_x2, items_in_column = 0;

			boolean all_done = true; // flag to exit at end
			float total_x1 = 0, total_x2 = 0, left_gap = 0, right_gap;

			String alignment = "center";

			if (items_added < itemsInTable) {

				/**
				 * work out cell x boundaries on basis of objects
				 */
				for (i = 0; i < this.max_rows; i++) { // get width for column
					if (itemCount[i] > currentItem[i]) { // item id

						item = ((Vector_Int) this.lines.elementAt(i)).elementAt(currentItem[i]);
						current_x1 = f_x1[item];
						current_x2 = f_x2[item];

						if (current_x1 < x1) // left margin
						x1 = current_x1;
						if (current_x2 < min_x2) // right margin if appropriate
						min_x2 = current_x2;

					}
				}

				cell_x1.addElement(x1); // save left margin
				x2 = min_x2; // set default right margin

				/**
				 * workout end and next column start by scanning all items
				 */
				// NOTE(review): unlike the loop above, the current item is read here without first testing itemCount[i] > currentItem[i]
				for (i = 0; i < this.max_rows; i++) { // slot the next item on each row together work out item
					item = ((Vector_Int) this.lines.elementAt(i)).elementAt(currentItem[i]);
					c_x1 = f_x1[item];
					c_x2 = f_x2[item];

					// max item width of this column
					if ((c_x1 >= x1) & (c_x1 < min_x2) & (c_x2 > x2)) x2 = c_x2;

					if (currentItem[i] < itemCount[i]) { // next left margin

						item = ((Vector_Int) this.lines.elementAt(i)).elementAt(currentItem[i] + 1);
						current_x1 = f_x1[item];
						if ((current_x1 > min_x2) & (current_x1 < next_x1)) next_x1 = current_x1;
					}
				}

				// stop infinite loop case
				if (x1 == x2) break;

				// allow for last column
				if (next_x1 == 9999) next_x1 = x2;

				/**
				 * count items in table and workout raw totals for alignment. Also work out widest x2 in column
				 */
				for (i = 0; i < this.max_rows; i++) { // slot the next item on each row together

					// work out item
					item = ((Vector_Int) this.lines.elementAt(i)).elementAt(currentItem[i]);
					c_x1 = f_x1[item];
					c_x2 = f_x2[item];

					// use items in first column of single colspan
					if ((c_x1 >= x1) & (c_x1 < min_x2) & (c_x2 <= next_x1)) {

						// running totals to calculate alignment
						total_x1 = total_x1 + c_x1;
						total_x2 = total_x2 + c_x2;
						items_in_column++;

					}
				}

				/**
				 * work out gap and include empty space between cols and save
				 */
				// NOTE(review): i is the counter left over from the loop above, so it equals max_rows at this point rather than a column number
				if (i == 0) left_gap = x1;
				if (next_x1 == -1) right_gap = 0;
				else right_gap = (int) ((next_x1 - x2) / 2);

				int width = (int) (x2 - x1 + right_gap + left_gap);
				// noinspection UnusedAssignment,UnusedAssignment
				left_gap = right_gap;
				widths.addElement(width);

				/** workout the alignment */
				// items_in_column is a float, so a column with no counted items gives NaN here and the alignment stays "center"
				float x1_diff = (total_x1 / items_in_column) - x1;
				float x2_diff = x2 - (total_x2 / items_in_column);
				if (x1_diff < 1) alignment = "left";
				else
					if (x2_diff < 1) alignment = "right";
				alignments.addElement(alignment);

				for (i = 0; i < this.max_rows; i++) { // slot the next item on each row together
					this.master = ((Vector_Int) this.lines.elementAt(i)).elementAt(0);
					// get next item on line or -1 for no more
					if (itemCount[i] > currentItem[i]) {
						// work out item
						item = ((Vector_Int) this.lines.elementAt(i)).elementAt(currentItem[i]);
						c_x1 = f_x1[item];
						c_x2 = f_x2[item];
						all_done = false;

					}
					else {
						item = -1;
						c_x1 = -1;
						c_x2 = -1;
					}

					if ((item == -1) & (items_added <= itemsInTable)) {
						// all items in table so just filling in gaps
						rowContents[i].addElement(-1);

					}
					else
						if ((c_x1 >= x1) & (c_x1 < x2)) {
							// fits into cell so add in and roll on marker

							rowContents[i].addElement(item);
							currentItem[i]++;

							items_added++;
						}
						else
							if (c_x1 > x2) { // empty cell
								rowContents[i].addElement(-1);
							}
				}
			}
			if (all_done) break;
		}

		// ===================================================================
		/**
		 * now assemble rows
		 */
		for (int row = 0; row < this.max_rows; row++) {
			StringBuilder line_content = new StringBuilder(100);

			int count = rowContents[row].size() - 1;
			this.master = ((Vector_Int) this.lines.elementAt(row)).elementAt(0);

			for (i = 0; i < count; i++) {
				item = rowContents[row].elementAt(i);

				if (this.isXHTML) {

					// get width
					float current_width = widths.elementAt(i);
					String current_alignment = alignments.elementAt(i);
					int test, colspan = 1, pointer = i + 1;

					if (item != -1) {

						// look for colspan - swallow following empty cells into this one, widening it as we go
						while (true) {
							test = rowContents[row].elementAt(i + 1);
							if ((test != -1) | (count == i + 1)) break;

							// break if over another col - roll up single value on line
							if ((itemCount[row] > 1) & (cell_x1.elementAt(i + 1) > f_x2[item])) break;

							count--;
							rowContents[row].removeElementAt(i + 1);
							colspan++;

							// update width
							current_width = current_width + widths.elementAt(pointer);
							pointer++;
						}
					}
					// NOTE(review): the statement below is syntactically damaged in this copy of the file (unbalanced quotes and a stray
					// closing brace); text seems to have been lost between the first append and the colspan test - restore from upstream
					line_content.append(" 1) line_content.append(" colspan='").append(colspan).append('\'');
					}

					if (keep_width_information) line_content.append(" width='").append((int) current_width).append('\'');

					line_content.append(" nowrap>");
					if (item == -1) line_content.append(empty_cell);
					else line_content.append(this.content[item]);
					line_content.append("");

				}
				else { // csv
					if (item == -1) // empty col
					line_content.append("\"\",");
					else { // value
						line_content.append('\"');
						line_content.append(this.content[item]);
						line_content.append("\",");
					}
				}

				// merge to update other values
				if ((item != -1) && (this.master != item)) // merge tracks the shape
				merge(this.master, item, separator, false);

			}
			// substitute our 'hand coded' value
			this.content[this.master] = line_content;

		}
	}

	/**
	 * Work through the fragments and group them into rows. Each row is a Vector_Int of fragment ids appended to this.lines (with the truncated y2
	 * of its first fragment appended to this.lineY2 and max_rows incremented). Starting from each unused fragment of the requested writing mode,
	 * the closest fragment further along the line whose vertical midpoint lies inside the starting fragment is found repeatedly; it is either
	 * merged into the previous cell (when the gap is below 1.5 average character widths of both) or added as a new cell.
	 * 
	 * @param itemCount
	 *            number of entries of items to process
	 * @param items
	 *            fragment ids in scan order; replaced internally for right-to-left and top-to-bottom text
	 * @param addSpaceXMLTag
	 *            passed through to isGapASpace when two fragments are merged
	 * @param mode
	 *            one of the PdfData writing-mode constants
	 * @throws PdfException
	 *             if mode is not one of the four handled writing modes
	 */
	private void createLinesInTable(int itemCount, int[] items, boolean addSpaceXMLTag, int mode) throws PdfException {

		/**
		 * reverse order if text right to left
		 */
		if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) items = reverse(items);

		/**
		 * local references to the coordinate arrays (aliases of the instance arrays, not copies), remapped per writing mode
		 */
		float[] f_x1, f_x2, f_y1, f_y2;

		// set pointers so always left to right text
		switch (mode) {
			case PdfData.HORIZONTAL_LEFT_TO_RIGHT:
				f_x1 = this.f_x1;
				f_x2 = this.f_x2;
				f_y1 = this.f_y1;
				f_y2 = this.f_y2;
				break;

			case PdfData.HORIZONTAL_RIGHT_TO_LEFT:
				f_x2 = this.f_x1;
				f_x1 = this.f_x2;
				f_y1 = this.f_y1;
				f_y2 = this.f_y2;
				break;

			case PdfData.VERTICAL_BOTTOM_TO_TOP:
				f_x1 = this.f_y1;
				f_x2 = this.f_y2;
				f_y1 = this.f_x2;
				f_y2 = this.f_x1;
				break;

			case PdfData.VERTICAL_TOP_TO_BOTTOM:
				f_x1 = this.f_y2;
				f_x2 = this.f_y1;
				f_y2 = this.f_x1;
				f_y1 = this.f_x2;
				// the caller's ordering is discarded: fragments are re-fetched sorted on y1 and reversed
				// (assumes the new array is at least itemCount long - TODO confirm)
				items = this.getsortedUnusedFragments(false, true);
				items = reverse(items);
				break;

			default:
				throw new PdfException("Illegal value " + mode + "for currentWritingMode");
		}

		// holds line we're working on
		Vector_Int current_line;

		for (int j = 0; j < itemCount; j++) { // for all items

			// c = fragment that starts the line, id = best candidate found in the current scan, last = most recent cell on the line
			int c = items[j], id = -1, i, last = c;
			// smallest_gap below zero means 'no candidate yet'
			float smallest_gap = -1, gap, yMidPt;

			if (!this.isUsed[c] && this.writingMode[c] == mode) {

				// reset pointer and add this element
				current_line = new Vector_Int(20);
				current_line.addElement(c);
				this.lineY2.addElement((int) f_y2[c]);

				// look for items along same line (already sorted into order left to right)
				while (true) { // look for a match
					for (int ii = 0; ii < itemCount; ii++) {

						i = items[ii];

						// NOTE(review): the writingMode test below is on c (already checked above), not on candidate i - confirm whether
						// writingMode[i] was intended
						if (!this.isUsed[i]
								&& i != c
								&& this.writingMode[c] == mode
								&& ((f_x1[i] > f_x1[c] && mode != PdfData.VERTICAL_TOP_TO_BOTTOM) || (f_x1[i] < f_x1[c] && mode == PdfData.VERTICAL_TOP_TO_BOTTOM))) { // see if on right

							gap = (f_x1[i] - f_x2[c]);

							if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) gap = -gap;

							// allow for fp error
							if (gap < 0 && gap > -2) gap = 0;

							// make sure on right
							yMidPt = (f_y1[i] + f_y2[i]) / 2;

							// see if line & if only or better fit
							if (yMidPt < f_y1[c] && yMidPt > f_y2[c] && (smallest_gap < 0 || gap < smallest_gap)) {
								smallest_gap = gap;
								id = i;
							}
						}
					}

					if (id == -1) // exit when no more matches
					break;

					// merge in best match if fit found with last or if overlaps by less than half a space,otherwise join
					// t = gap to the previous cell, possSpace = gap to the line's first fragment
					float t = f_x1[id] - f_x2[last], possSpace = f_x1[id] - f_x2[c];
					// 1.5 x average character width of the candidate and of the previous cell
					float av_char1 = (float) 1.5 * ((f_x2[id] - f_x1[id]) / this.textLength[id]);
					float av_char2 = (float) 1.5 * ((f_x2[last] - f_x1[last]) / this.textLength[last]);

					if ((mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM)) {
						possSpace = -possSpace;
						t = -t;
						av_char1 = -av_char1;
						av_char2 = -av_char2;
					}

					if (t < av_char1 && t < av_char2) {
						// close enough to be part of the same cell
						merge(last, id, isGapASpace(id, last, possSpace, addSpaceXMLTag, mode), true);
					}
					else {
						// far enough away to start a new cell on this line
						current_line.addElement(id);
						last = id;
					}

					// flag used and reset variables used (only the matched fragment is flagged here, not c itself)
					this.isUsed[id] = true;
					id = -1;
					smallest_gap = 1000000;

				}

				// add line to list
				this.lines.addElement(current_line);
				this.max_rows++;
			}
		}
	}

	/**
	 * Extracts the text inside a rectangle as a table, in either XHTML or CSV form, by running the low level grouping routines in sequence: copy
	 * the fragments, remove encoding, clean up shadows, build lines, build rows and columns, then merge the rows.
	 * 
	 * Coordinates given in the wrong order are swapped rather than rejected.
	 * 
	 * @param x1
	 *            is the x coord of the top left corner
	 * @param y1
	 *            is the y coord of the top left corner
	 * @param x2
	 *            is the x coord of the bottom right corner
	 * @param y2
	 *            is the y coord of the bottom right corner
	 * @param pageNumber
	 *            is the page you wish to extract from (not read by this method itself)
	 * @param isCSV
	 *            is a boolean. If false the output is xhtml, if true the text is output as CSV
	 * @param keepFontInfo
	 *            passed to copyToArrays to keep font information in the extracted text
	 * @param keepWidthInfo
	 *            passed to createTableRows to keep width information in the extracted text
	 * @param keepAlignmentInfo
	 *            passed to createTableRows to keep alignment information in the extracted text
	 * @param borderWidth
	 *            is the width of the border for xhtml, passed to mergeTableRows
	 * @return Map with the keys "content", "x1", "x2", "y1" and "y2" (all values are Strings), or an empty Map when the rectangle holds no text
	 * @throws PdfException
	 *             propagated from the line and row building steps
	 */
	public final Map extractTextAsTable(int x1, int y1, int x2, int y2, int pageNumber, boolean isCSV, boolean keepFontInfo, boolean keepWidthInfo,
			boolean keepAlignmentInfo, int borderWidth) throws PdfException {

		// put the corners into the expected order
		final int[] box = validateCoordinates(x1, y1, x2, y2);
		x1 = box[0];
		y1 = box[1];
		x2 = box[2];
		y2 = box[3];

		Map table_content = new HashMap();

		LogWriter.writeLog("extracting Text As Table");

		// flag type of table so we can add correct separators
		this.isXHTML = !isCSV;

		// fresh table state
		this.lines = new Vector_Object(20);
		this.lineY2 = new Vector_Int(20);
		this.max_rows = 0;

		// load fragments (note the order in which the y values are passed), then tidy them
		copyToArrays(x1, y2, x2, y1, keepFontInfo, false, true, null, false);
		removeEncoding();
		cleanupShadowsAndDrownedObjects(false);

		final int[] items = this.getsortedUnusedFragments(true, false);
		final int item_count = items.length;

		if (item_count == 0) return table_content;

		// preferred orientation; the line builder skips starting fragments that do not match it
		final int preferredMode = getWritingMode(items, item_count);

		LogWriter.writeLog("Table Merging algorithm being applied " + (item_count) + " items");

		// NOTE(review): with exactly one fragment this.master is not assigned anywhere in this method - confirm it is set elsewhere
		if (item_count > 1) {

			// scan all items joining best fit to right of each fragment to build lines
			createLinesInTable(item_count, items, this.isXHTML, preferredMode);

			// sort key is lineY2, negated for left-to-right and top-to-bottom text so rows come out in order down the page
			final boolean negate = preferredMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || preferredMode == PdfData.VERTICAL_TOP_TO_BOTTOM;
			final int sign = negate ? -1 : 1;

			this.line_order = new int[this.max_rows];
			final int[] line_y = new int[this.max_rows];

			for (int row = 0; row < this.max_rows; row++) {
				line_y[row] = sign * this.lineY2.elementAt(row);
				this.line_order[row] = row;
			}

			this.line_order = Sorts.quicksort(line_y, this.line_order);

			// assemble the rows and columns, then fold the rows into one fragment
			createTableRows(keepAlignmentInfo, keepWidthInfo, preferredMode);
			mergeTableRows(borderWidth);
		}

		this.content[this.master] = cleanup(this.content[this.master]);

		String processed_value = this.content[this.master].toString();

		// cleanup data if needed by removing duplicate font tokens
		if (!isCSV) processed_value = Fonts.cleanupTokens(processed_value);

		table_content.put("content", processed_value);
		table_content.put("x1", String.valueOf(x1));
		table_content.put("x2", String.valueOf(x2));
		table_content.put("y1", String.valueOf(y1));
		table_content.put("y2", String.valueOf(y2));

		return table_content;
	}

	/**
	 * Puts a rectangle's corners into the expected order (x1 not greater than x2, y1 not less than y2). Values in the wrong order are swapped and
	 * the swap is logged; no exception is thrown.
	 * 
	 * @return the four values as { x1, y1, x2, y2 }
	 */
	private static int[] validateCoordinates(int x1, int y1, int x2, int y2) {

		int left = x1, right = x2;
		if (left > right) {
			left = x2;
			right = x1;
			LogWriter.writeLog("x1 > x2, coordinates were swapped to validate");
		}

		int top = y1, bottom = y2;
		if (top < bottom) {
			top = y2;
			bottom = y1;
			LogWriter.writeLog("y1 < y2, coordinates were swapped to validate");
		}

		return new int[] { left, top, right, bottom };
	}

	/**
	 * Extracts the text found inside a rectangle as a flat list of words and word coordinates.
	 * <p>
	 * The list is laid out as word, x1, y1, x2, y2, word, x1, ... and every entry, including the coordinates, is added as a String. Corners
	 * supplied in the wrong order are swapped by validateCoordinates rather than rejected.
	 * 
	 * @param x1
	 *            is the x coord of the top left corner
	 * @param y1
	 *            is the y coord of the top left corner
	 * @param x2
	 *            is the x coord of the bottom right corner
	 * @param y2
	 *            is the y coord of the bottom right corner
	 * @param page_number
	 *            is the page you wish to extract from (not read inside this method)
	 * @param breakFragments
	 *            if true the rectangle and punctuation are passed to copyToArrays, otherwise the no-argument copyToArrays is used
	 * @param punctuation
	 *            is a string containing all values that should be used to divide up words
	 * @return List containing words found and words coordinates (word, x1,y1,x2,y2...), or null if no text fragments were found
	 * @throws PdfException
	 *             declared because the line building step can throw it
	 */
	final public List extractTextAsWordlist(int x1, int y1, int x2, int y2, int page_number, boolean breakFragments, String punctuation)
			throws PdfException {

		// put the corners into the expected order
		final int[] corners = validateCoordinates(x1, y1, x2, y2);
		x1 = corners[0];
		y1 = corners[1];
		x2 = corners[2];
		y2 = corners[3];

		// load the raw fragments (note the order in which the corners are passed)
		if (!breakFragments) {
			copyToArrays();
		}
		else {
			copyToArrays(x1, y2, x2, y1, true, true, false, punctuation, true);
		}

		// delete any hidden text
		removeEncoding();

		// eliminate shadows and also merge overlapping text
		cleanupShadowsAndDrownedObjects(true);

		final int[] items = getsortedUnusedFragments(true, false);
		final int count = items.length;

		// nothing to extract
		if (count == 0) {
			LogWriter.writeLog("Less than 1 text item on page");
			return null;
		}

		// preferred orientation; fragments with another orientation are ignored when lines are built
		final int writingMode = getWritingMode(items, count);

		// build set of lines from text
		createLines(count, items, writingMode, true, false, false);

		// pick the coordinate arrays to report from; they stay null if the writing mode is not recognised
		float[] wordX1 = null;
		float[] wordX2 = null;
		float[] wordY1 = null;
		float[] wordY2 = null;

		if (useUnrotatedCoords || writingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) {
			wordX1 = this.f_x1;
			wordX2 = this.f_x2;
			wordY1 = this.f_y1;
			wordY2 = this.f_y2;
		}
		else if (writingMode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
			wordX1 = this.f_x2;
			wordX2 = this.f_x1;
			wordY1 = this.f_y1;
			wordY2 = this.f_y2;
		}
		else if (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
			wordX1 = this.f_y2;
			wordX2 = this.f_y1;
			wordY1 = this.f_x2;
			wordY2 = this.f_x1;
		}
		else if (writingMode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
			wordX1 = this.f_y1;
			wordX2 = this.f_y2;
			wordY1 = this.f_x2;
			wordY2 = this.f_x1;
		}

		// only rotated bottom-to-top text reports its two y values in the opposite order
		final boolean swapY = (!useUnrotatedCoords) && (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP);

		final List values = new ArrayList();

		for (int i = 0; i < this.content.length; i++) {

			if (this.content[i] == null) {
				continue;
			}

			// wrap the word in colour tags when colour was extracted for XML output
			if ((this.colorExtracted) && (this.isXMLExtraction)) {
				if (!this.content[i].toString().toLowerCase().startsWith(GenericColorSpace.cb)) {
					this.content[i].insert(0, this.f_colorTag[this.master]);
				}
				if (!this.content[i].toString().toLowerCase().endsWith(GenericColorSpace.ce)) {
					this.content[i].append(GenericColorSpace.ce);
				}
			}

			// the word itself
			if (this.isXMLExtraction) {
				values.add((this.content[i]).toString());
			}
			else {
				values.add(Strip.convertToText((this.content[i]).toString(), this.isXMLExtraction));
			}

			// followed by its four coordinates
			values.add(String.valueOf(wordX1[i]));
			if (swapY) {
				values.add(String.valueOf(wordY2[i]));
				values.add(String.valueOf(wordX2[i]));
				values.add(String.valueOf(wordY1[i]));
			}
			else {
				values.add(String.valueOf(wordY1[i]));
				values.add(String.valueOf(wordX2[i]));
				values.add(String.valueOf(wordY2[i]));
			}
		}

		LogWriter.writeLog("Text extraction as wordlist completed");

		return values;
	}

	/**
	 * Puts the per-extraction state back to its starting values before a new extraction runs.
	 */
	private void reset() {

		// output flag and next free slot
		isXHTML = true;
		nextSlot = 0;

		// start with an empty set of line breaks
		lineBreaks = new Vector_Int();

		// row count and master fragment slot
		max_rows = 0;
		master = 0;

		// no colour information extracted yet
		colorExtracted = false;
	}

	/**
	 * Extracts the text found inside a rectangle on the page as a single String.
	 * <p>
	 * Corners supplied in the wrong order are swapped by validateCoordinates rather than rejected.
	 * 
	 * @param x1
	 *            is the x coord of the top left corner
	 * @param y1
	 *            is the y coord of the top left corner
	 * @param x2
	 *            is the x coord of the bottom right corner
	 * @param y2
	 *            is the y coord of the bottom right corner
	 * @param page_number
	 *            is the page you wish to extract from (not read inside this method)
	 * @param estimateParagraphs
	 *            will attempt to find paragraphs and add new lines in output if true
	 * @param breakFragments
	 *            if true the rectangle is passed to copyToArrays, otherwise the no-argument copyToArrays is used
	 * @return the merged text, or null if no text fragments were found
	 * @throws PdfException
	 *             if breakFragments is requested but pdf_data.IsEmbedded() is false, or if line building/merging fails
	 */
	final public String extractTextInRectangle(int x1, int y1, int x2, int y2, int page_number, boolean estimateParagraphs, boolean breakFragments)
			throws PdfException {

		reset();

		if (breakFragments && !this.pdf_data.IsEmbedded()) {
			throw new PdfException(
					"[PDF] Request to breakfragments and width not added. Please add call to init(true) of PdfDecoder to your code.");
		}

		// put the corners into the expected order
		final int[] corners = validateCoordinates(x1, y1, x2, y2);
		x1 = corners[0];
		y1 = corners[1];
		x2 = corners[2];
		y2 = corners[3];

		// load the raw fragments (note the order in which the corners are passed)
		if (!breakFragments) {
			copyToArrays();
		}
		else {
			copyToArrays(x1, y2, x2, y1, (this.isXMLExtraction), false, false, null, false);
		}

		// delete any hidden text
		removeEncoding();

		// eliminate shadows and also merge overlapping text
		cleanupShadowsAndDrownedObjects(false);

		// get the fragments as an array
		final int[] items = getsortedUnusedFragments(true, false);
		final int count = items.length;

		if (count == 0) {
			LogWriter.writeLog("Less than 1 text item on page");
			return null;
		}

		// preferred orientation; fragments with another orientation are ignored when lines are built
		final int writingMode = getWritingMode(items, count);

		// build set of lines from text
		createLines(count, items, writingMode, false, this.isXMLExtraction, false);

		// roll the lines together into one slot
		final int mergedSlot = mergeLinesTogether(writingMode, estimateParagraphs, x1, x2, y1, y2);

		// final delimiters for XML output
		// NOTE(review): the inserted and appended strings are empty here, so these two calls change nothing - confirm whether markup was lost
		if (this.isXMLExtraction) {
			this.content[mergedSlot] = new StringBuilder(Fonts.cleanupTokens(this.content[mergedSlot].toString()));
			this.content[mergedSlot].insert(0, "");
			this.content[mergedSlot].append("");
		}

		LogWriter.writeLog("Text extraction completed");

		return cleanup(this.content[mergedSlot]).toString();
	}

	/**
	 * Final tidy-up of extracted text before it is handed back to the caller.
	 * <p>
	 * A null buffer is returned as is. When isXMLExtraction is false the buffer is returned untouched. Otherwise the text is passed through a
	 * protect / replace / restore sequence and the decimal character references listed below are removed; the result is returned in a NEW
	 * StringBuilder.
	 * 
	 * @param buffer
	 *            text to clean, may be null
	 * @return the cleaned text (the same instance when nothing was done), or null if buffer was null
	 */
	private StringBuilder cleanup(StringBuilder buffer) {

		if (buffer == null) return buffer;

		// NOTE(review): the commented-out demo code below is incomplete as stored (the for loop header is truncated)
/**
        if(PdfDecoder.inDemo){
            int icount=buffer.length(),count=0;
            boolean inToken=false;
            for(int i=0;i')
                    inToken=false;
                else if((c!=' ')&&(!inToken)){
                    count++;
                    if(count>4){
                        count=0;
                        buffer.setCharAt(i,'1');
                    }
                }
            }
		}
		/**/

		// sort out ampersands (XML output only)
		if (this.isXMLExtraction) {
			String buf = buffer.toString();

			// step 1: hide sequences that must survive the ampersand replacement behind XX placeholders
			buf = buf.replaceAll("&#", "XX#");
			buf = buf.replaceAll("<", "XXlt");
			buf = buf.replaceAll(">", "XXgt");

			// step 2: ampersand replacement
			// NOTE(review): as written this replaces an ampersand with an identical ampersand, i.e. it changes nothing.
			// Presumably the replacement was meant to be an escaped entity and was lost - confirm against the upstream source.
			buf = buf.replaceAll("&", "&");

			// step 3: put back others
			// (any XX#, XXlt or XXgt text that was already present in the input is converted here as well)
			buf = buf.replaceAll("XX#", "&#");
			buf = buf.replaceAll("XXlt", "<");
			buf = buf.replaceAll("XXgt", ">");

			// always true - kept as a local switch for the removal step below
			boolean removeInvalidXMLValues = true;
			if (removeInvalidXMLValues) {

				/**
				 * Restricted Char ::= [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F] [#x1-#x8] | [#x11-#x12] | [#x14-#x31] |
				 * [#x127-#x132] | [#x134-#x159]
				 */

				/** set mappings: each key is a DECIMAL character reference (ampersand, hash, number, semicolon) mapped to an empty string */
				Map asciiMappings = new HashMap();
				/** decimal 1-8 (hex #x1-#x8) */
				for (int i = 1; i <= 8; i++)
					asciiMappings.put("&#" + i + ';', "");

				/** decimal 11-12 (hex #xB-#xC) */
				for (int i = 11; i <= 12; i++)
					asciiMappings.put("&#" + i + ';', "");

				/** decimal 14-31 (hex #xE-#x1F) */
				for (int i = 14; i <= 31; i++)
					asciiMappings.put("&#" + i + ';', "");

				/** decimal 127-132 - disabled */
				// for (int i = 127; i <= 132; i++)
				// asciiMappings.put("&#" + i + ";", "");

				/** decimal 134-159 - disabled */
				// for (int i = 134; i <= 159; i++)
				// asciiMappings.put("&#" + i + ";", "");

				/** substitute illegal XML characters for mapped values (literal replace, not regex) */
				for (Object o : asciiMappings.keySet()) {
					String character = (String) o;
					String mappedCharacter = (String) asciiMappings.get(character);

					buf = buf.replace(character, mappedCharacter);
				}
			}
			// hand back a fresh builder holding the cleaned text
			buffer = new StringBuilder(buf);
		}

		return buffer;
	}

	/**
	 * Works out which writing mode to use for a set of fragments, preferring horizontal text.
	 * <p>
	 * The mode of the first fragment is used unless it is vertical and a later, unused fragment is horizontal; in that case the mode of the
	 * first such horizontal fragment wins and a message is logged.
	 * 
	 * @param items
	 *            fragment ids to inspect (must hold at least one entry)
	 * @param count
	 *            number of entries of items to inspect
	 * @return the chosen writing mode
	 */
	private int getWritingMode(int[] items, int count) {

		final int firstMode = this.writingMode[items[0]];

		// first fragment already horizontal - nothing more to check
		if (firstMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || firstMode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
			return firstMode;
		}

		// look through the remaining fragments for the first unused horizontal one
		for (int j = 1; j < count; j++) {

			final int fragment = items[j];
			if (this.isUsed[fragment]) {
				continue;
			}

			final int mode = this.writingMode[fragment];
			if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
				LogWriter.writeLog("Text of multiple orientations found. Only horizontal text used.");
				return mode;
			}
		}

		return firstMode;
	}

	/**
	 * Rolls the unused lines into a single content slot and returns the index of that slot.
	 * <p>
	 * The slot at the end of the sorted index list is the master; every other entry is merged into it, working backwards through the list.
	 * Entries for which getLastChar returns -1 are skipped.
	 * <p>
	 * NOTE(review): several string literals in this method are split across two source lines. This looks like markup lost from the literals -
	 * confirm against the upstream source before changing any of them.
	 * 
	 * @param currentWritingMode
	 *            one of the four PdfData writing mode constants
	 * @param estimateParagraphs
	 *            if true a separator is only added where the master ends with a full stop or double quote and the child starts with A-Z
	 * @param x1
	 *            used with x2 to compute the middle of the area for horizontal text
	 * @param x2
	 *            used with x1 to compute the middle of the area for horizontal text
	 * @param y1
	 *            used with y2 to compute the middle of the area for vertical text
	 * @param y2
	 *            used with y1 to compute the middle of the area for vertical text
	 * @return index of the content slot holding the merged text
	 * @throws PdfException
	 *             if currentWritingMode is not one of the four known values
	 */
	private int mergeLinesTogether(int currentWritingMode, boolean estimateParagraphs, int x1, int x2, int y1, int y2) throws PdfException {

		String separator;

		int[] indices;

		// used for working out alignment
		int middlePage;

		/**
		 * local views of the coordinate arrays, swapped so the code below can treat every writing mode the same way
		 */
		float[] f_x1, f_x2, f_y1, f_y2;

		if (currentWritingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) {
			f_x1 = this.f_x1;
			f_x2 = this.f_x2;
			f_y1 = this.f_y1;
			f_y2 = this.f_y2;
			indices = getsortedUnusedFragments(false, true);
			middlePage = (x1 + x2) / 2;
		}
		else
			if (currentWritingMode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
				f_x2 = this.f_x1;
				f_x1 = this.f_x2;
				f_y1 = this.f_y1;
				f_y2 = this.f_y2;
				indices = getsortedUnusedFragments(false, true);
				middlePage = (x1 + x2) / 2;
			}
			else
				if (currentWritingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
					f_x1 = this.f_y1;
					f_x2 = this.f_y2;
					f_y1 = this.f_x2;
					f_y2 = this.f_x1;
					indices = getsortedUnusedFragments(true, true);

					// this mode walks the fragments in the opposite order
					indices = reverse(indices);
					middlePage = (y1 + y2) / 2;

				}
				else
					if (currentWritingMode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
						f_x1 = this.f_y2;
						f_x2 = this.f_y1;
						f_y2 = this.f_x2;
						f_y1 = this.f_x1;
						indices = getsortedUnusedFragments(true, true);
						middlePage = (y1 + y2) / 2;
					}
					else {
						throw new PdfException("Illegal value " + currentWritingMode + "for currentWritingMode");
					}
		int quarter = middlePage / 2;
		int count = indices.length;
		// NOTE(review): an empty index list makes this read index -1 and throw ArrayIndexOutOfBoundsException
		int master = indices[count - 1];

		/**
		 * now loop through all lines merging
		 */
		int ClastChar, MlastChar, CFirstChar;
		final boolean debug = false;

		for (int i = count - 2; i > -1; i--) {

			int child = indices[i];
			separator = "";

			/** add formatting in to retain structure */
			// text to see if lasts ends with . and next starts with capital

			// -1 if no chars
			ClastChar = getLastChar(this.content[child]);
			// debug is a constant false, so the values computed in this branch are never used
			if (debug) {

				CFirstChar = getFirstChar(this.content[child]);
				MlastChar = getLastChar(this.content[master]);

				StringBuilder child_textX = Strip.stripXML(this.content[child], this.isXMLExtraction);
				String master_textX = Strip.stripXML(this.content[master], this.isXMLExtraction).toString();

			}

			if (ClastChar != -1) {

				addAlignmentFormatting(estimateParagraphs, middlePage, f_x1, f_x2, quarter, child);

				// see if we insert a line break and merge
				String lineSpace = "
" + SystemSeparator + "";
				if (this.isXMLExtraction) lineSpace = SystemSeparator;

				// distance between the master and the child, and the height of the child line
				float gap = f_y2[master] - f_y1[child];
				float line_height = f_y1[child] - f_y2[child];
				if (currentWritingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
					gap = -gap;
					line_height = -line_height;
				}

				if ((gap > line_height) & (line_height > 0)) { // add in line gaps

					// one lineSpace per whole line height that fits in the gap
					while (gap > line_height) {
						separator = separator + lineSpace;
						gap = gap - line_height;
					}

					// NOTE(review): for non-XML output the else branch below replaces the separator built by the loop above - confirm this is intended
					if (this.isXMLExtraction) separator = separator + "
" + SystemSeparator + "";
					else separator = SystemSeparator;

				}
				else
					if (estimateParagraphs == true) {

						CFirstChar = getFirstChar(this.content[child]);
						MlastChar = getLastChar(this.content[master]);

						// master ends with a full stop or double quote and child starts with a capital A-Z: treat as a new paragraph
						if ((((MlastChar == '.')) || (((MlastChar == '\"')))) && ((CFirstChar >= 'A') && (CFirstChar <= 'Z'))) {
							if (this.isXMLExtraction) separator = "
" + SystemSeparator + "";
							else separator = SystemSeparator;
						}

					}
					else {
						// no paragraph estimation: always break, by editing the content directly rather than via the separator
						if (this.isXMLExtraction) {
							this.content[child].insert(0, "
" + SystemSeparator + "");
						}
						else this.content[master].append(SystemSeparator);
					}

				merge(master, child, separator, false);

			}
		}
		return master;
	}

	/**
	 * Finds the first visible character of a fragment, skipping leading spaces and anything between angle brackets.
	 * <p>
	 * When isXMLExtraction is set, text running from an ampersand to a semicolon is also treated as a token: the lt and gt entities are
	 * reported as the angle bracket they stand for, and an ampersand that does not start a recognised token is reported as a plain
	 * ampersand.
	 * 
	 * @param buffer
	 *            fragment text to scan
	 * @return the character as an int, or -1 if no character was found
	 */
	private int getFirstChar(StringBuilder buffer) {

		int i = -1;
		boolean inTag = false;
		int count = buffer.length();
		// character that opened the token currently being skipped
		char openChar = ' ';
		int ptr = 0;

		// setting ptr to count anywhere below ends the scan after the current pass
		while (ptr < count) {
			char nextChar = buffer.charAt(ptr);

			// start of a tag, or of an entity when extracting XML
			if ((!inTag) && ((nextChar == '<') || (this.isXMLExtraction && nextChar == '&'))) {
				inTag = true;
				openChar = nextChar;

				// trap & .... &xx; or other spurious
				if ((openChar == '&')) {
					// ampersand is the very last character, so it cannot start an entity
					if ((ptr + 1) == count) {
						i = '&';
						ptr = count;
					}
					else {
						char c = buffer.charAt(ptr + 1);

						// only a following #, g or l is accepted as the start of an entity
						if ((c != '#') && (c != 'g') && (c != 'l')) {
							i = '&';
							ptr = count;
						}
					}
				}
			}

			// first non-space character outside any token is the answer
			if ((!inTag) && (nextChar != ' ')) {
				i = nextChar;
				ptr = count;
			}

			// allow for valid & in stream
			if ((inTag) && (openChar == '&') && (nextChar == ' ')) {
				i = openChar;
				ptr = count;
			}
			else
				// end of the tag, or of the entity when extracting XML
				if ((inTag) && ((nextChar == '>') || (this.isXMLExtraction && openChar == '&' && nextChar == ';'))) {

					// put back < or >
					// (the single & below means both of its operands are always evaluated)
					if ((nextChar == ';') && (openChar == '&') && (ptr > 2) & (buffer.charAt(ptr - 1) == 't')) {
						if ((buffer.charAt(ptr - 2) == 'l')) {
							i = '<';
							ptr = count;
						}
						else
							if ((buffer.charAt(ptr - 2) == 'g')) {
								i = '>';
								ptr = count;
							}
					}

					inTag = false;
				}

			ptr++;
		}

		return i;
	}

	/**
	 * Finds the last visible character of a fragment by scanning backwards from the end of the buffer.
	 * <p>
	 * Spaces are skipped, as is anything between angle brackets. When isXMLExtraction is set, text running back from a semicolon to an
	 * ampersand is treated as a token as well, with the lt and gt entities reported as the angle bracket they stand for.
	 * <p>
	 * NOTE(review): the statement that starts with lastTokenStart further down is truncated in this copy of the file (the string literal is
	 * never closed and the code that followed it is missing). Restore it from the upstream source; everything from that line onwards has been
	 * left exactly as found.
	 * 
	 * @param buffer
	 *            fragment text to scan
	 * @return char as int or -1 if no match
	 */
	private int getLastChar(StringBuilder buffer) {

		int i = -1;
		boolean inTag = false;
		int count = buffer.length();
		// original length, needed later for bounds checks when looking ahead of the scan position
		int size = count;
		// character that opened (when reading backwards) the token currently being skipped
		char openChar = ' ';
		count--; // knock 1 off so points to last char

		// setting count to -1 anywhere below ends the scan after the current pass
		while (count > -1) {
			char nextChar = buffer.charAt(count);

			// trap &xx;;
			if (inTag && openChar == ';' && nextChar == ';') {
				i = ';';
				count = -1;
			}

			// possible end of a tag, or of an entity when extracting XML
			if (!inTag && (nextChar == '>' || (this.isXMLExtraction && nextChar == ';'))) {
				inTag = true;

				// check it is a token and not just > at end
				int lastTokenStart = buffer.lastIndexOf("') {
							inTag = false;
							ptr = count;
						}
					}
				}

				if (inTag) openChar = nextChar;
				else {
					i = nextChar;
					count = -1;
				}
			}

			if (!inTag && nextChar != 32) {
				i = nextChar;
				count = -1;
			}

			if (nextChar == '<' || (this.isXMLExtraction && openChar == ';' && nextChar == '&')) {
				inTag = false;

				// put back < or >
				if ((nextChar == '&') && (count + 3 < size) & (buffer.charAt(count + 2) == 't') && (buffer.charAt(count + 3) == ';')) {
					if ((buffer.charAt(count + 1) == 'l')) {
						i = '<';
						count = -1;
					}
					else
						if ((buffer.charAt(count + 1) == 'g')) {
							i = '>';
							count = -1;
						}
				}
			}

			if (inTag && openChar == ';' && nextChar == ' ') {
				count = -1;
				i = ';';
			}
			count--;
		}

		return i;
	}

	/**
	 * Returns a new array holding the same values as the one supplied, in the opposite order. The array passed in is not modified.
	 * 
	 * @param indices
	 *            values to copy
	 * @return a new array of the same length with the values back to front
	 */
	private static int[] reverse(int[] indices) {
		final int[] flipped = new int[indices.length];

		// fill the copy from its last slot towards its first
		int target = flipped.length;
		for (int value : indices) {
			flipped[--target] = value;
		}
		return flipped;
	}

	/**
	 * Wraps the text of one line in alignment markup when it looks centred or right aligned. Only acts for XML extraction without paragraph
	 * estimation.
	 * <p>
	 * NOTE(review): the strings inserted in front of the text are empty or a bare line break in this copy of the file, and one literal is
	 * split across two source lines. The alignment tags appear to have been lost - confirm against the upstream source.
	 * 
	 * @param estimateParagraphs
	 *            no formatting is added when this is true
	 * @param middlePage
	 *            midpoint the line is measured against
	 * @param f_x1
	 *            start coordinate of each line, indexed by slot
	 * @param f_x2
	 *            end coordinate of each line, indexed by slot
	 * @param quarter
	 *            half of middlePage as computed by the caller; the line must start beyond this value
	 * @param child
	 *            slot of the line to format
	 */
	private void addAlignmentFormatting(boolean estimateParagraphs, int middlePage, float[] f_x1, float[] f_x2, int quarter, int child) {
		// how far the line reaches either side of the midpoint
		float left_gap = middlePage - f_x1[child];
		float right_gap = f_x2[child] - middlePage;
		// the line must straddle the midpoint and start between quarter and middlePage + quarter
		if ((!estimateParagraphs) && (this.isXMLExtraction) && (left_gap > 0) && (right_gap > 0) && (f_x1[child] > quarter)
				&& (f_x1[child] < (middlePage + quarter))) {

			// ratio of the smaller gap to the larger one, so it never exceeds 1
			float ratio = left_gap / right_gap;
			if (ratio > 1) ratio = 1 / ratio;

			if (ratio > 0.95) { // add centring if seems centered around middle
				this.content[child] = new StringBuilder(Fonts.cleanupTokens(this.content[child].toString()));
				this.content[child].insert(0, "
");
				this.content[child].append("\n");
			}
			else
				if ((right_gap < 10) & (left_gap > 30)) { // add right align
					this.content[child] = new StringBuilder(Fonts.cleanupTokens(this.content[child].toString()));
					this.content[child].insert(0, "");
					this.content[child].append("\n");

				}
		}
	}

	/**
	 * Converts fragments into lines of text.
	 * <p>
	 * For every unused fragment whose writing mode matches, the method repeatedly looks for the best unused fragment to join on to it and
	 * merges the two, until no candidate is left or a word break is detected.
	 * 
	 * @param count
	 *            number of entries of items to process
	 * @param items
	 *            fragment ids; walked in reverse order for right-to-left and top-to-bottom text
	 * @param mode
	 *            writing mode to build lines for; fragments with another mode are never used as the start of a line
	 * @param breakOnSpace
	 *            if true, stop extending a line when the fragment is flagged in hadSpace or the separator starts with a space
	 * @param addMultiplespaceXMLTag
	 *            passed through to isGapASpace
	 * @param sameLineOnly
	 *            if true, extra checks on gap, baseline and font size reject candidates
	 * @throws PdfException
	 *             if mode is not one of the four known writing modes
	 */
	private void createLines(int count, int[] items, int mode, boolean breakOnSpace, boolean addMultiplespaceXMLTag, boolean sameLineOnly)
			throws PdfException {

		String separator;

		final boolean debug = false;

		/**
		 * local views of the coordinate arrays (these shadow the fields of the same name)
		 */
		float[] f_x1, f_x2, f_y1, f_y2;

		/**
		 * reverse order if text right to left
		 */
		if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) items = reverse(items);

		/**
		 * set pointers so left to right text
		 */
		if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) {
			f_x1 = this.f_x1;
			f_x2 = this.f_x2;
			f_y1 = this.f_y1;
			f_y2 = this.f_y2;
		}
		else
			if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
				f_x2 = this.f_x1;
				f_x1 = this.f_x2;
				f_y1 = this.f_y1;
				f_y2 = this.f_y2;
			}
			else
				if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
					f_x1 = this.f_y1;
					f_x2 = this.f_y2;
					f_y1 = this.f_x2;
					f_y2 = this.f_x1;
				}
				else
					if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
						f_x1 = this.f_y2;
						f_x2 = this.f_y1;
						f_y2 = this.f_x1;
						f_y1 = this.f_x2;
					}
					else {
						throw new PdfException("Illegal value " + mode + "for currentWritingMode");
					}

		/**
		 * scan items joining best fit to right of each fragment to build lines. This is tedious and processor intensive but necessary as the order
		 * cannot be guaranteed
		 */
		for (int j = 0; j < count; j++) {

			// id = best candidate found in the current pass (-1 = none), c = fragment the line is being built on
			int id = -1, i;
			int c = items[j];

			// smallest_gap below zero means no candidate has been accepted yet
			float smallest_gap = -1, gap, yMidPt;
			if (!this.isUsed[c] && this.writingMode[c] == mode) {

				if (debug) System.out.println("Look for match with " + removeHiddenMarkers(this.content[c].toString()));

				// each pass finds and merges at most one fragment; the loop ends through one of the breaks below
				while (true) {
					for (int j2 = 0; j2 < count; j2++) {
						i = items[j2];

						if (this.isUsed[i] == false) {

							// amount of variation in bottom of text
							int baseLineDifference = (int) (f_y2[i] - f_y2[c]);
							if (baseLineDifference < 0) baseLineDifference = -baseLineDifference;

							// amount of variation in top of text
							int topLineDifference = (int) (f_y1[i] - f_y1[c]);
							if (topLineDifference < 0) topLineDifference = -topLineDifference;

							// line gap
							int lineGap = (int) (f_x1[i] - f_x2[c]);

							// Check if fragments are closer from the other end
							if (lineGap > (int) (f_x1[c] - f_x2[i])) lineGap = (int) (f_x1[c] - f_x2[i]);

							// absolute difference in font size between the two fragments
							int fontSizeChange = this.fontSize[c] - this.fontSize[i];
							if (fontSizeChange < 0) fontSizeChange = -fontSizeChange;

							if (debug) System.out.println("Against " + removeHiddenMarkers(this.content[i].toString()));

							// cases 1, 3, 4 and 5 reject the candidate and only apply when sameLineOnly is set
							if (sameLineOnly && lineGap > this.fontSize[c] && lineGap > 0) { // ignore text in wrong order allowing slight margin for error
								// allow for multicolumns with gap

								if (debug) System.out.println("case1 lineGap=" + lineGap);
								// //Case removed as it broke one file and had no effect on other files
								// }else if (sameLineOnly && (lineGap > (fontSize[c]*10)|| lineGap > (fontSize[i]*10)) ) { //JUMP IN TEXT SIZE ACROSS
								// COL
								// //ignore
								//
								// if(debug)
								// System.out.println("case2");
							}
							else
								if (sameLineOnly && baseLineDifference > 1 && lineGap > 2 * this.fontSize[c]
										&& (this.fontSize[c] == this.fontSize[i])) { // TEXT SLIGHTLY OFFSET
									// ignore
									if (debug) System.out.println("case3");
								}
								else
									if (sameLineOnly && baseLineDifference > 3) {
										// ignore
										if (debug) System.out.println("case4");
									}
									else
										if (sameLineOnly && fontSizeChange > 2) {
											// ignore
											if (debug) System.out.println("case5");
										}
										else
											// see if on right. Because && binds tighter than ||, the font size / top line test
											// at the end of this condition only applies to the top-to-bottom alternative
											if (i != c
													&& ((f_x1[i] > f_x1[c] && mode != PdfData.VERTICAL_TOP_TO_BOTTOM) || f_x1[i] < f_x1[c]
															&& mode == PdfData.VERTICAL_TOP_TO_BOTTOM && this.writingMode[c] == mode
															&& (!(fontSizeChange > 2) || (fontSizeChange > 2 && topLineDifference < 3)))) {

												gap = (f_x1[i] - f_x2[c]);

												if (debug) System.out.println("case6 gap=" + gap);

												if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) gap = -gap;

												// allow for fp error
												if ((gap < 0) && (gap > -2)) gap = 0;

												// make sure on right
												yMidPt = (f_y1[i] + f_y2[i]) / 2;

												// see if line & if only or better fit
												if ((yMidPt < f_y1[c]) && (yMidPt > f_y2[c]) && ((smallest_gap < 0) || (gap < smallest_gap))) {
													smallest_gap = gap;
													id = i;
												}
											}
						}
					}

					// merge on next right item or exit when no more matches
					if (id == -1) break;

					// size of the gap between the two fragments, measured in reading direction
					float possSpace = f_x1[id] - f_x2[c];
					if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) possSpace = -possSpace;
					else
						if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) possSpace = (f_x2[id] - f_x1[c]);

					// add space if gap between this and last object
					separator = isGapASpace(c, id, possSpace, addMultiplespaceXMLTag, mode);

					/** stop here, leaving the candidate unmerged, when breaking on spaces and a space was found */
					if ((breakOnSpace) && (this.hadSpace != null) && ((this.hadSpace[c]) || (separator.startsWith(" ")))) break;

					merge(c, id, separator, true);

					id = -1; // reset
					// and reset the gap; from here on only candidates with a gap below this value can be accepted
					smallest_gap = 1000000;

				}
			}
		}
	}

	/**
	 * Orders search highlights for a page shown at a given rotation. Each argument is either a single Rectangle or a Rectangle[]; for an array
	 * only the first element takes part in the comparison.
	 * <ul>
	 * <li>rotation 0 or 180: larger y sorts first, then smaller x</li>
	 * <li>rotation 90: smaller x sorts first, then smaller y</li>
	 * <li>rotation 270: larger x sorts first, then smaller y</li>
	 * <li>any other rotation, or an empty array on either side: -1</li>
	 * </ul>
	 * NOTE(review): compare never returns 0 - two highlights at the same position give -1 whichever way round they are passed. Code in this
	 * file that uses the comparator as a TreeMap ordering therefore keeps such entries apart; this is outside the java.util.Comparator
	 * contract, so change with care.
	 */
	static class ResultsComparator implements Comparator {
		private int rotation;

		public ResultsComparator(int rotation) {
			this.rotation = rotation;
		}

		@Override
		public int compare(Object o1, Object o2) {
			final Rectangle[] first = asArray(o1);
			final Rectangle[] second = asArray(o2);

			// nothing to compare
			if (first.length == 0 || second.length == 0) {
				return -1;
			}

			final int angle = this.rotation;
			if (angle != 0 && angle != 90 && angle != 180 && angle != 270) {
				return -1;
			}

			final Rectangle a = first[0];
			final Rectangle b = second[0];

			if (angle == 0 || angle == 180) {
				// different rows: the one with the larger y comes first
				if (a.y != b.y) {
					return (a.y > b.y) ? -1 : 1;
				}
				// same row: the one on the left comes first
				return (a.x > b.x) ? 1 : -1;
			}

			// 90 or 270: x identifies the row, y the position along it
			if (a.x == b.x) {
				return (a.y > b.y) ? 1 : -1;
			}

			if (angle == 90) {
				return (a.x > b.x) ? 1 : -1;
			}

			return (a.x < b.x) ? 1 : -1;
		}

		/** A Rectangle[] is used as it is; anything else is cast to Rectangle and wrapped in a one element array. */
		private static Rectangle[] asArray(Object value) {
			if (value instanceof Rectangle[]) {
				return (Rectangle[]) value;
			}
			return new Rectangle[] { (Rectangle) value };
		}
	}

	// 
	/**
	 * Algorithm to find multiple text terms in x1,y1,x2,y2 rectangle on page_number, with matching teaser
	 * <p>
	 * While the search runs, usingMultipleTerms and includeTease are forced to true. Both are put back (usingMultipleTerms to false,
	 * includeTease to its previous value) when the method finishes, including when the search throws.
	 * 
	 * @param x1
	 *            the left x cord
	 * @param y1
	 *            the upper y cord
	 * @param x2
	 *            the right x cord
	 * @param y2
	 *            the lower y cord
	 * @param rotation
	 *            the rotation of the page to be searched, used to order the results
	 * @param page_number
	 *            the page number to search on
	 * @param terms
	 *            the terms to search for
	 * @param searchType
	 *            searchType the search type made up from one or more constants obtained from the SearchType class
	 * @param listener
	 *            used to cancel the search between terms; may be null
	 * @return a SortedMap containing a collection of Rectangle describing the location of found text, mapped to a String which is the matching teaser
	 * @throws PdfException
	 *             if the underlying text search fails
	 */
	public SortedMap findMultipleTermsInRectangleWithMatchingTeasers(int x1, int y1, int x2, int y2, final int rotation, int page_number,
			String[] terms, int searchType, SearchListener listener) throws PdfException {

		// remember the caller's setting so it can be put back whatever happens
		final boolean origIncludeTease = this.includeTease;

		try {
			this.usingMultipleTerms = true;
			this.multipleTermTeasers.clear();
			this.teasers = null;
			this.includeTease = true;

			List highlights = findMultipleTermsInRectangle(x1, y1, x2, y2, page_number, terms, searchType, listener);

			SortedMap highlightsWithTeasers = new TreeMap(new ResultsComparator(rotation));

			for (int i = 0; i < highlights.size(); i++) {

				/* highlights.get(i) is a rectangle or a rectangle[] */
				highlightsWithTeasers.put(highlights.get(i), this.multipleTermTeasers.get(i));
			}

			return highlightsWithTeasers;
		}
		finally {
			// previously skipped when the search threw, leaving both flags switched on for later calls
			this.usingMultipleTerms = false;
			this.includeTease = origIncludeTease;
		}
	}

	// 
	/**
	 * Algorithm to find multiple text terms in x1,y1,x2,y2 rectangle on page_number.
	 * <p>
	 * usingMultipleTerms is set to true while the search runs and is set back to false when the method finishes, including when the search
	 * throws.
	 * 
	 * @param x1
	 *            the left x cord
	 * @param y1
	 *            the upper y cord
	 * @param x2
	 *            the right x cord
	 * @param y2
	 *            the lower y cord
	 * @param rotation
	 *            the rotation of the page to be searched, used when ordering the results
	 * @param page_number
	 *            the page number to search on
	 * @param terms
	 *            the terms to search for
	 * @param orderResults
	 *            if true the list that is returned is ordered to return the resulting rectangles in a logical order descending down the page, if
	 *            false, rectangles for multiple terms are grouped together.
	 * @param searchType
	 *            searchType the search type made up from one or more constants obtained from the SearchType class
	 * @param listener
	 *            used to cancel the search between terms; may be null
	 * @return a list of Rectangle (or Rectangle[] for a match split over several areas) describing the location of found text
	 * @throws PdfException
	 *             if the underlying text search fails
	 */
	public List findMultipleTermsInRectangle(int x1, int y1, int x2, int y2, final int rotation, int page_number, String[] terms,
			boolean orderResults, int searchType, SearchListener listener) throws PdfException {

		try {
			this.usingMultipleTerms = true;
			this.multipleTermTeasers.clear();
			this.teasers = null;

			List highlights = findMultipleTermsInRectangle(x1, y1, x2, y2, page_number, terms, searchType, listener);

			if (orderResults) {
				Collections.sort(highlights, new ResultsComparator(rotation));
			}

			return highlights;
		}
		finally {
			// previously skipped when the search threw, leaving the flag switched on for later calls
			this.usingMultipleTerms = false;
		}
	}

	/**
	 * Searches for each term in turn with findText and turns the coordinates it returns into highlight areas.
	 * <p>
	 * findText results are read in groups of five floats: x1, y1, x2, y2 and a marker. A group whose marker equals linkedSearchAreas is
	 * joined with the groups that follow it, up to and including the first group with a different marker, and added to the result as one
	 * Rectangle[]; any other group is added as a single Rectangle.
	 * 
	 * @param x1
	 *            passed as the first argument of the search Rectangle
	 * @param y1
	 *            passed as the second argument of the search Rectangle
	 * @param x2
	 *            passed as the third argument of the search Rectangle
	 * @param y2
	 *            passed as the fourth argument of the search Rectangle
	 * @param page_number
	 *            the page number to search on
	 * @param terms
	 *            the terms to search for; each one is searched for on its own
	 * @param searchType
	 *            passed through to findText
	 * @param listener
	 *            checked before each term so the search can be cancelled; may be null
	 * @return list holding a Rectangle or Rectangle[] for every match, grouped by term
	 * @throws PdfException
	 *             if findText fails
	 */
	private List findMultipleTermsInRectangle(int x1, int y1, int x2, int y2, int page_number, String[] terms, int searchType, SearchListener listener)
			throws PdfException {

		final List results = new ArrayList();

		for (String searchTerm : terms) {

			// stop as soon as the caller cancels
			if (listener != null && listener.isCanceled()) {
				break;
			}

			final float[] found = findText(new Rectangle(x1, y1, x2, y2), page_number, new String[] { searchTerm }, searchType);
			if (found == null) {
				continue;
			}

			final int total = found.length;
			int pos = 0;

			while (pos < total) {

				int left = (int) found[pos];
				int top = (int) found[pos + 1];
				int right = (int) found[pos + 2];
				int bottom = (int) found[pos + 3];
				Rectangle area = new Rectangle(left, bottom, right - left, top - bottom);

				int marker = (int) found[pos + 4];

				if (marker != this.linkedSearchAreas) {
					// stand-alone match
					results.add(area);
				}
				else {
					// match continues in the following group(s): collect them all into one array
					final Vector_Rectangle linked = new Vector_Rectangle();
					linked.addElement(area);
					do {
						pos = pos + 5;
						left = (int) found[pos];
						top = (int) found[pos + 1];
						right = (int) found[pos + 2];
						bottom = (int) found[pos + 3];
						marker = (int) found[pos + 4];
						area = new Rectangle(left, bottom, right - left, top - bottom);
						linked.addElement(area);
					}
					while (marker == this.linkedSearchAreas);

					linked.trim();
					results.add(linked.get());
				}

				pos = pos + 5;
			}
		}
		return results;
	}

	// 
	/**
	 * Finds the supplied search terms in the text of the current page, optionally allowing a hit to be split
	 * across several lines.
	 * <p>
	 * The text is searched once per writing mode, most common writing mode first, and writing modes with no
	 * text fragments are ignored. Zero-length matches (for example from an empty term or from a regular
	 * expression that can match nothing) are skipped because they have no extent to highlight.
	 * <p>
	 * If teasers are enabled they are stored as a side effect: appended to multipleTermTeasers when
	 * usingMultipleTerms is set, otherwise assigned to the teasers field.
	 *
	 * @param searchArea
	 *            = Area on page to search. NOTE(review): this implementation never reads this value, so the
	 *            whole page is always searched
	 * @param page_number
	 *            = the current page to search. NOTE(review): this implementation never reads this value
	 * @param terms
	 *            = the text to search for. If null an empty array is returned
	 * @param searchType
	 *            = bitwise combination of SearchType flags controlling how to search the pdf
	 * @return the coords of the found text in a float[] where the coords are pdf page coords. The origin of the
	 *         coords is the bottom left hand corner (on unrotated page). Each result takes five consecutive
	 *         values, organised in the following order:
	 *         <ul>
	 *         <li>[0]=result x1 coord</li>
	 *         <li>[1]=result y1 coord</li>
	 *         <li>[2]=result x2 coord</li>
	 *         <li>[3]=result y2 coord</li>
	 *         <li>[4]=the value of linkedSearchAreas (documented as -101) to show that the next text area is
	 *         the remainder of this result on another line, else any other value is ignored</li>
	 *         </ul>
	 * @throws PdfException
	 *             declared by the signature; presumably propagated from the helper methods called here
	 */
	final public float[] findText(Rectangle searchArea, int page_number, String[] terms, int searchType) throws PdfException {

		// No search terms supplied, so there is nothing to do
		if (terms == null) return new float[] {};

		// Decode the search options once up front; none of them depends on the writing mode being processed
		final boolean multiLine = (searchType & SearchType.MUTLI_LINE_RESULTS) == SearchType.MUTLI_LINE_RESULTS;
		final boolean firstOccuranceOnly = (searchType & SearchType.FIND_FIRST_OCCURANCE_ONLY) == SearchType.FIND_FIRST_OCCURANCE_ONLY;
		final boolean wholeWordsOnly = (searchType & SearchType.WHOLE_WORDS_ONLY) == SearchType.WHOLE_WORDS_ONLY;
		final boolean useRegEx = (searchType & SearchType.USE_REGULAR_EXPRESSIONS) == SearchType.USE_REGULAR_EXPRESSIONS;

		// Bitwise flags for the regular expressions engine
		int options = 0;

		// Matching is case insensitive unless case sensitive searching was requested
		if ((searchType & SearchType.CASE_SENSITIVE) != SearchType.CASE_SENSITIVE) {
			options = (options | Pattern.CASE_INSENSITIVE);
		}

		// Allow search to find split line results
		if (multiLine) {
			options = (options | Pattern.MULTILINE | Pattern.DOTALL);
		}

		// Set once any term has been found while only first occurrences are wanted
		boolean foundFirst = false;

		// Search result and teaser holders (shared by all writing modes)
		Vector_Float resultCoords = new Vector_Float(0);
		Vector_String resultTeasers = new Vector_String(0);

		// Extract the text data into local arrays for searching
		copyToArrays();

		// Remove any hidden text on page as should not be found
		cleanupShadowsAndDrownedObjects(false);

		// Get unused text objects and sort them for correct searching
		int[] items = getsortedUnusedFragments(true, false);

		/**
		 * check orientation and get preferred. Items not correct will be ignored
		 */
		int l2r = 0;
		int r2l = 0;
		int t2b = 0;
		int b2t = 0;

		// Count how many fragments use each writing mode
		for (int i = 0; i != items.length; i++) {
			switch (this.writingMode[items[i]]) {
				case 0:
					l2r++;
					break;
				case 1:
					r2l++;
					break;
				case 2:
					t2b++;
					break;
				case 3:
					b2t++;
					break;
			}
		}

		int[] unsorted = new int[] { l2r, r2l, t2b, b2t };
		int[] sorted = new int[] { l2r, r2l, t2b, b2t };

		// Set all to -1 so we can tell if it's been set yet
		int[] writingModes = new int[] { -1, -1, -1, -1 };

		Arrays.sort(sorted);

		// Rank the writing modes by fragment count: writingModes[0] ends up holding the most common mode.
		// sorted is ascending, so index j maps to rank |j - 3|; on ties the first free rank is taken.
		for (int i = 0; i != unsorted.length; i++) {
			for (int j = 0; j < sorted.length; j++) {
				if (unsorted[i] == sorted[j]) {

					int pos = j - 3;
					if (pos < 0) pos = -pos;

					if (writingModes[pos] == -1) {
						writingModes[pos] = i;
						j = sorted.length;
					}
				}
			}
		}

		for (int u = 0; u != writingModes.length; u++) {

			int writingMode = writingModes[u];

			// if not lines for writing mode, ignore
			if (unsorted[writingMode] != 0) {

				// Merge text fragments into lines as displayed on page
				createLines(items.length, items, writingMode, true, false, true);

				/**
				 * create local copies of arrays, swapping around x and y for vertical text so routine works on all cases
				 */
				float[] f_y1 = this.f_y1, f_y2 = this.f_y2;
				boolean valuesSwapped = false;
				if (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP || writingMode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
					f_y1 = this.f_x2;
					f_y2 = this.f_x1;
					valuesSwapped = true;
				}

				// Portions of text to perform the search on and find teasers
				String[] searchText;
				String[] coordsText;

				// Merge all text into one with \n line separators
				// This will allow checking for multi line split results
				StringBuilder merged = new StringBuilder();
				for (int i = 0; i != this.content.length; i++) {
					if (this.content[i] != null && writingMode == this.writingMode[i]) {
						merged.append(this.content[i]).append('\n');
					}
				}

				// Remove double spaces, replacing them with single spaces.
				// Both versions of the text start from the same merged content so they stay aligned.
				String raw = removeDuplicateSpaces(merged.toString());
				String plain = raw;

				// Strip xml from content and keep coords and text data
				raw = Strip.stripXML(raw, this.isXMLExtraction).toString();

				// Strip xml and coords data from content and keep text data
				plain = removeHiddenMarkers(plain);
				plain = Strip.stripXML(plain, this.isXMLExtraction).toString();

				// Store text in the search and teaser arrays
				searchText = new String[] { plain };
				coordsText = new String[] { raw };

				// Hold starting point data at page rotation
				Point resultStart;

				// Work through the search terms one at a time
				for (int j = 0; j != terms.length; j++) {

					String searchValue = terms[j];

					// Set the default separator between words in a search term
					String sep = " ";

					// Multiline needs space or newline to be recognised as word separators
					if (multiLine) {
						sep = "[ \\\\n]";
					}

					// if not using reg ex add reg ex literal flags around the text and word separators
					// (Pattern.quote also copes with a term that itself contains \E)
					if (!useRegEx) {
						searchValue = Pattern.quote(searchValue);
						sep = "\\\\E" + sep + "\\\\Q";
					}

					// If word seperator has changed, replace all spaces with modified seperator
					if (!sep.equals(" ")) {
						searchValue = searchValue.replaceAll(" ", sep);
					}

					// Surround search term with word boundry tags to match whole words
					if (wholeWordsOnly) searchValue = "\\b" + searchValue + "\\b";

					// Create pattern to match search term
					Pattern searchTerm = Pattern.compile(searchValue, options);

					// Create pattern to match search term with two words before and after
					Pattern teaserTerm = Pattern.compile("(?:\\S+\\s)?\\S*(?:\\S+\\s)?\\S*" + searchValue + "\\S*(?:\\s\\S+)?\\S*(?:\\s\\S+)?",
							options);

					// Loop through all search text
					for (int i = 0; i != searchText.length; i++) {

						// Get text data and text+coord data
						String plainText = searchText[i];
						String coordText = coordsText[i];

						// So long as text data is not null
						if (plainText != null) {

							// Create two matchers for finding search term and teaser
							Matcher termFinder = searchTerm.matcher(plainText);
							Matcher teaserFinder = teaserTerm.matcher(plainText);
							boolean needToFindTeaser = true;

							// Keep looping till no result is returned
							while (termFinder.find()) {
								resultStart = null;

								// A zero-length match has nothing to highlight, and the coordinate scan below
								// would reach its end before its start (resultStart still null), so skip it
								if (termFinder.end() == termFinder.start()) continue;

								// Make note of the text found and index in the text
								String foundTerm = termFinder.group();
								int termStarts = termFinder.start();
								int termEnds = termFinder.end() - 1;

								// If storing teasers
								if (this.includeTease) {

									// Store the term found as a default value
									String teaser = foundTerm;

									if (this.includeHTMLtags) teaser = "<b>" + teaser + "</b>";

									boolean itemFound = false;
									if (needToFindTeaser) {
										itemFound = teaserFinder.find();
									}

									if (itemFound) {
										// Get a teaser if found and set the search term to bold is allowed
										if (teaserFinder.start() < termStarts && teaserFinder.end() > termEnds) {

											// replace default with found teaser
											teaser = teaserFinder.group();

											if (this.includeHTMLtags) {
												// Calculate points to add bold tags
												int teaseStarts = termStarts - teaserFinder.start();
												int teaseEnds = (termEnds - teaserFinder.start()) + 1;

												// Add bold tags
												teaser = teaser.substring(0, teaseStarts) + "<b>" + teaser.substring(teaseStarts, teaseEnds) + "</b>"
														+ teaser.substring(teaseEnds, teaser.length());
											}
											needToFindTeaser = true;
										}
										else {
											// Teaser match does not enclose this term, so stop looking for teasers in this text
											needToFindTeaser = false;
										}
									}

									// Store teaser
									resultTeasers.addElement(teaser);
								}

								// Get coords of found text for highlights
								float currentX;
								float width;

								// Track point in text data line (without coord data)
								int pointInLine = -1;

								// Track line on page
								int lineCounter = 0;

								// Skip null values and value not in the correct writing mode to ensure correct result coords.
								// A non-empty match implies at least one such line contributed text, so this terminates.
								while (this.content[lineCounter] == null || writingMode != this.writingMode[lineCounter])
									lineCounter++;

								// Flags used to catch if result is split accross lines
								boolean startFound = false;
								boolean endFound = false;

								// Cycle through coord text looking for coords of this result
								// Ignore first value as it is known to be the first marker
								for (int pointer = 1; pointer < coordText.length(); pointer++) {

									// find second marker and get x coord
									int startPointer = pointer;
									while (pointer < coordText.length()) {
										if (coordText.charAt(pointer) == MARKER2) break;
										pointer++;
									}

									// Convert text to float value for x coord
									currentX = Float.parseFloat(coordText.substring(startPointer, pointer));
									pointer++;

									// find third marker and get width
									startPointer = pointer;
									while (pointer < coordText.length()) {
										if (coordText.charAt(pointer) == MARKER2) break;

										pointer++;
									}

									// Convert text to float value for character width
									width = Float.parseFloat(coordText.substring(startPointer, pointer));
									pointer++;

									// find fourth marker and get text (character)
									startPointer = pointer;
									while (pointer < coordText.length()) {
										if (coordText.charAt(pointer) == MARKER2) break;

										pointer++;
									}

									// Store text to check for newline character later
									String text = coordText.substring(startPointer, pointer);
									pointInLine += text.length();

									// Start of term not found yet.
									// Point in line is equal to or greater than start of the term.
									// Store coords and mark start as found.
									if (!startFound && pointInLine >= termStarts) {
										resultStart = new Point((int) currentX, (int) f_y1[lineCounter]);
										startFound = true;
									}

									// End of term not found yet.
									// Point in line is equal to or greater than end of the term.
									// Store coords and mark end as found.
									if (!endFound && pointInLine >= termEnds) {
										if (valuesSwapped) {
											if (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
												resultCoords.addElement((int) f_y2[lineCounter]);
												resultCoords.addElement((int) currentX + width);
												resultCoords.addElement(resultStart.y);
												resultCoords.addElement(resultStart.x);
												resultCoords.addElement(0.0f);
											}
											else {
												resultCoords.addElement((int) f_y2[lineCounter]);
												resultCoords.addElement(resultStart.x);
												resultCoords.addElement(resultStart.y);
												resultCoords.addElement((int) currentX + width);
												resultCoords.addElement(0.0f);
											}
										}
										else {
											resultCoords.addElement(resultStart.x);
											resultCoords.addElement(resultStart.y);
											resultCoords.addElement(currentX + width);
											resultCoords.addElement(f_y2[lineCounter]);
											resultCoords.addElement(0.0f);
										}

										endFound = true;
									}

									// Using multi line option.
									// Start of term found.
									// End of term not found.
									// New line character found.
									// Set up multi line result.
									if (startFound && !endFound && text.contains("\n")) {
										// Set ends coords
										if (valuesSwapped) {
											if (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
												resultCoords.addElement((int) f_y2[lineCounter]);
												resultCoords.addElement((int) currentX + width);
												resultCoords.addElement(resultStart.y);
												resultCoords.addElement(resultStart.x);
												resultCoords.addElement(this.linkedSearchAreas); // Mark next result as linked

											}
											else {
												resultCoords.addElement((int) f_y2[lineCounter]);
												resultCoords.addElement(resultStart.x);
												resultCoords.addElement(resultStart.y);
												resultCoords.addElement((int) currentX + width);
												resultCoords.addElement(this.linkedSearchAreas); // Mark next result as linked

											}
										}
										else {
											resultCoords.addElement(resultStart.x);
											resultCoords.addElement(resultStart.y);
											resultCoords.addElement(currentX + width);
											resultCoords.addElement(f_y2[lineCounter]);
											resultCoords.addElement(this.linkedSearchAreas); // Mark next result as linked
										}
										// Set start of term as not found
										startFound = false;

										// Set this point in line as start of next term
										// Guarantees next character is found as
										// start of the next part of the search term
										termStarts = pointInLine;
									}

									// In multiline mode we progress the line number when we find a \n
									// This is to allow the correct calculation of y coords
									if (text.contains("\n")) {
										lineCounter++;

										// If current content pointed at is null or not the correct writing mode, skip value until data is found
										while (lineCounter < this.content.length
												&& (this.content[lineCounter] == null || writingMode != this.writingMode[lineCounter])) {
											lineCounter++;
										}
									}

								}

								// If only finding first occurance,
								// Stop searching this text data for search term.
								if (firstOccuranceOnly) {
									foundFirst = true;
									break;
								}
							}

							// If only finding first occurance and first is found,
							// Stop searching all text data for this search term.
							if (firstOccuranceOnly && foundFirst) {
								break;
							}
						}
					}

				}

				// Remove any trailing empty values
				resultCoords.trim();

				// If including tease values
				if (this.includeTease) {

					// Remove any trailing empty values
					resultTeasers.trim();

					// Store teasers so they can be retrieved by different search methods
					// NOTE(review): resultTeasers is shared by all writing modes, so when more than one writing
					// mode has text, teasers from an earlier mode are passed to multipleTermTeasers again here
					if (this.usingMultipleTerms) {
						// Store all teasers for so they may be returned as a sorted map
						// Only used for one method controled by the above flag
						for (int i = 0; i != resultTeasers.size(); i++)
							this.multipleTermTeasers.add(resultTeasers.elementAt(i));
					}
					else {
						// Store all teasers to be retrieved by getTeaser() method
						this.teasers = resultTeasers.get();
					}
				}
			}
		}
		// Return coord data for search results
		return resultCoords.get();
	}

	/**
	 * Collapses every run of two or more consecutive spaces into a single space.
	 * <p>
	 * A single replace pass only halves each run (three spaces would still leave two), so the replacement is
	 * repeated until no double space remains.
	 *
	 * @param textValue
	 *            = text to clean up (must not be null)
	 * @return the text with no two adjacent spaces; the same value is returned if there was nothing to change
	 */
	private static String removeDuplicateSpaces(String textValue) {

		while (textValue.contains("  ")) {

			textValue = textValue.replace("  ", " ");

		}
		return textValue;
	}

	/**
	 * Accessor for the endPoints field (documented as the end points from the last findText call).
	 *
	 * @return the current value of endPoints; the stored array itself is returned, not a copy
	 */
	public float[] getEndPoints() {
		return endPoints;
	}

	/**
	 * Accessor for the teasers stored by findText. Teasers are only generated if generateTeasers() was called
	 * before the search.
	 *
	 * @return the current value of the teasers field; the stored array itself is returned, not a copy
	 */
	public String[] getTeasers() {
		return teasers;
	}

	/**
	 * Switches on teaser generation, so that findText also stores a teaser for each result it finds.
	 */
	public void generateTeasers() {
		includeTease = true;
	}
}