All downloads are free. Search and download functionality uses the official Maven repository.

org.jpedal.grouping.PdfGroupingAlgorithms Maven / Gradle / Ivy

The newest version!
/*
 * ===========================================
 * Java Pdf Extraction Decoding Access Library
 * ===========================================
 *
 * Project Info:  http://www.idrsolutions.com
 * Help section for developers at http://www.idrsolutions.com/java-pdf-library-support/
 *
 * (C) Copyright 1997-2013, IDRsolutions and Contributors.
 *
 * 	This file is part of JPedal
 *
     This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


 *
 * ---------------
 * PdfGroupingAlgorithms.java
 * ---------------
 */
package org.jpedal.grouping;

import java.awt.Point;
import java.awt.Rectangle;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jpedal.color.GenericColorSpace;
import org.jpedal.exception.PdfException;
import org.jpedal.objects.PdfData;
import org.jpedal.objects.PdfPageData;
import org.jpedal.utils.Fonts;
import org.jpedal.utils.LogWriter;
import org.jpedal.utils.Sorts;
import org.jpedal.utils.Strip;
import org.jpedal.utils.repositories.Vector_Float;
import org.jpedal.utils.repositories.Vector_Int;
import org.jpedal.utils.repositories.Vector_Object;
import org.jpedal.utils.repositories.Vector_Rectangle;
import org.jpedal.utils.repositories.Vector_String;

/**
 * Applies heuristics to unstructured PDF text to create content
 */
public class PdfGroupingAlgorithms {

	/** value stored by setIncludeHTML(boolean); that setter describes it as controlling whether teasers include HTML */
	private boolean includeHTMLtags = false;

	// NOTE(review): these two constants are not referenced in the visible part of the class; judging by the names
	// they select how search terms are delimited - confirm against the callers
	public static final int USER_DEFINED_LIST_ONLY = 0;
	public static final int SURROUND_BY_ANY_PUNCTUATION = 1;

	/** separator string; initialised from the line.separator system property and replaceable through setSeparator(String) */
	private static String SystemSeparator = System.getProperty("line.separator");

	// public PdfGroupingAlgorithms() {}

	/** ==============START OF ARRAYS================ */
	/**
	 * Content is stored in a set of parallel arrays (the same index in each array describes one text fragment). We have tried various other
	 * methods (e.g. creating a composite object) and none are entirely satisfactory. The beauty of this method is speed.
	 */

	/**
	 * flag to show this item has been merged into another and should be ignored. This allows us to repeat operations on live elements without lots of
	 * deleting.
	 */
	private boolean[] isUsed;

	/** co-ords of object (x1,y1 is top left) */
	private float[] f_x1, f_x2, f_y1, f_y2;

	/** track if we removed space from end */
	private boolean[] hadSpace;

	/** hold colour info */
	private String[] f_colorTag;

	/** hold writing mode (compared against the PdfData writing-mode constants such as PdfData.HORIZONTAL_LEFT_TO_RIGHT) */
	private int[] writingMode;

	/** hold move type */
	private int[] moveType;

	/** font sizes in pixels */
	private int[] fontSize;

	/** amount of space a space uses in this font/size */
	private float[] spaceWidth;

	/** actual text */
	private StringBuilder[] content;

	/** raw number of text characters */
	private int[] textLength;

	/** ==============END OF ARRAYS================ */

	/**
	 * handle on page data object. We extract data from this into local arrays and return grouped content into object at end. This is done for speed.
	 */
	private PdfData pdf_data;

	/** page data object supplied to the constructor */
	PdfPageData pageData;

	/** flag to show if output for table is CSV or XHTML */
	private boolean isXHTML = true;

	/** slot to insert next value - used when we split fragments for table code */
	private int nextSlot;

	/** vertical breaks for table calculation (x positions are added by findVerticalLines) */
	private Vector_Int lineBreaks = new Vector_Int();

	/** holds details as we scan lines for table */
	private Vector_Object lines;

	/** lookup table used to sort into correct order for table */
	private Vector_Int lineY2;

	/**
	 * marker char used in content (we bury location for each char so we can split)
	 */
	private static final String MARKER = PdfData.marker;
	/** first character of MARKER, used where the content is scanned char by char */
	public static char MARKER2 = MARKER.charAt(0);

	/** counters for cols and rows and pointer to final object we merge into */
	private int max_rows = 0, master = 0;

	/** flag to show color info is being extracted (read from pdf_data.isColorExtracted()) */
	private boolean colorExtracted = false;

	/** used to calculate correct order for table lines */
	private int[] line_order;

	/** amount we resize arrays holding content with if no space */
	private final static int increment = 100;

	// NOTE(review): not referenced in the visible part of the class - confirm intended use before relying on it
	public static boolean useUnrotatedCoords;

	/** end points if text located */
	private float[] endPoints;

	/** flag to show if teaser is created on findText */
	private boolean includeTease;

	/** teasers for findText */
	private String[] teasers;

	// NOTE(review): raw List whose element type is not visible here; presumably the teasers collected when
	// searching for several terms - confirm
	private List multipleTermTeasers = new ArrayList();

	private boolean usingMultipleTerms = false;

	/** value passed to the constructor; when true the text is treated as carrying XML tags (e.g. Strip.stripXML is used, tags are kept in order on merge) */
	private boolean isXMLExtraction = true;

	/*
	 * Variables to allow cross line search results
	 */
	/** Value placed between result areas to show they are part of the same result */
	private int linkedSearchAreas = -101;

	/** create a new instance, passing in raw data */
	public PdfGroupingAlgorithms(PdfData pdf_data, PdfPageData pageData, boolean isXMLExtraction) {
		this.pdf_data = pdf_data;
		this.pageData = pageData;
		this.isXMLExtraction = isXMLExtraction;
		this.colorExtracted = pdf_data.isColorExtracted();
	}

	/**
	 * Replaces the separator string held in the static SystemSeparator field (which starts out as the line.separator system property).
	 *
	 * @param sep new separator value
	 */
	public static void setSeparator(String sep) {
		PdfGroupingAlgorithms.SystemSeparator = sep;
	}

	/**
	 * workout if we should use space, CR or no separator when joining lines
	 */
	static final private String getLineDownSeparator(StringBuilder rawLine1, StringBuilder rawLine2, boolean isXMLExtraction) {

		String returnValue = " "; // space is default

		boolean hasUnderline = false;

		/** get 2 lines without any XML or spaces so we can look at last char */
		StringBuilder line1, line2;
		if (isXMLExtraction) {
			line1 = Strip.stripXML(rawLine1, isXMLExtraction);
			line2 = Strip.stripXML(rawLine2, isXMLExtraction);
		}
		else {
			line1 = Strip.trim(rawLine1);
			line2 = Strip.trim(rawLine2);
		}

		/** get lengths and if appropriate perform tests */
		int line1Len = line1.length();
		int line2Len = line2.length();
		// System.out.println(line1Len+" "+line2Len);
		if ((line1Len > 1) && (line2Len > 1)) {

			/** get chars to test */
			char line1Char2 = line1.charAt(line1Len - 1);
			char line1Char1 = line1.charAt(line1Len - 2);
			char line2Char1 = line2.charAt(0);
			char line2Char2 = line2.charAt(1);

			// deal with hyphenation first - ignore unless :- or space-
			String hyphen_values = "";
			if (hyphen_values.indexOf(line1Char2) != -1) {
				returnValue = ""; // default of nothing
				if (line1Char1 == ':') returnValue = "\n";
				if (line1Char2 == ' ') returnValue = " ";

				// paragraph breaks if full stop and next line has ascii char or Capital Letter
			}
			else
				if (((line1Char1 == '.') | (line1Char2 == '.'))
						& (Character.isUpperCase(line2Char1) | (line2Char1 == '&') | Character.isUpperCase(line2Char2) | (line2Char2 == '&'))) {
					if (isXMLExtraction) returnValue = "

\n"; else returnValue = "\n"; } } // add an underline if appropriate if (hasUnderline) { if (isXMLExtraction) returnValue = returnValue + "

\n"; else returnValue = returnValue + '\n'; } return returnValue; } /** * remove shadows from text created by double printing of text and drowned items where text inside other text */ private final void cleanupShadowsAndDrownedObjects(boolean avoidSpaces) { // get list of items int[] items = getUnusedFragments(); int count = items.length; int c, n; String separator; float diff; // work through objects and eliminate shadows or roll together overlaps for (int p = 0; p < count; p++) { // master item c = items[p]; // ignore used items if (this.isUsed[c] == false) { // work out mid point in text float midX = (this.f_x1[c] + this.f_x2[c]) / 2; float midY = (this.f_y1[c] + this.f_y2[c]) / 2; for (int p2 = p + 1; p2 < count; p2++) { // item to test against n = items[p2]; if ((this.isUsed[n] == false) && (this.isUsed[c] == false)) { float fontDiff = this.fontSize[n] - this.fontSize[c]; if (fontDiff < 0) fontDiff = -fontDiff; diff = (this.f_x2[n] - this.f_x1[n]) - (this.f_x2[c] - this.f_x1[c]); if (diff < 0) diff = -diff; /** stop spurious matches on overlapping text */ if (fontDiff == 0 && (midX > this.f_x1[n]) && (midX < this.f_x2[n]) && (diff < 10) && (midY < this.f_y1[n]) && (midY > this.f_y2[n])) { this.isUsed[n] = true; // pick up drowned text items (item inside another) } else { boolean a_in_b = (this.f_x1[n] > this.f_x1[c]) && (this.f_x2[n] < this.f_x2[c]) && (this.f_y1[n] < this.f_y1[c]) && (this.f_y2[n] > this.f_y2[c]); boolean b_in_a = (this.f_x1[c] > this.f_x1[n]) && (this.f_x2[c] < this.f_x2[n]) && (this.f_y1[c] < this.f_y1[n]) && (this.f_y2[c] > this.f_y2[n]); // merge together if (a_in_b || b_in_a) { // get order right - bottom y2 underneath if (this.f_y2[c] > this.f_y2[n]) { separator = getLineDownSeparator(this.content[c], this.content[n], this.isXMLExtraction); if ((avoidSpaces == false) || (separator.indexOf(' ') == -1)) { merge(c, n, separator, true); } } else { separator = getLineDownSeparator(this.content[n], this.content[c], this.isXMLExtraction); if 
(!avoidSpaces || separator.indexOf(' ') == -1) { merge(n, c, separator, true); } } // recalculate as may have changed midX = (this.f_x1[c] + this.f_x2[c]) / 2; midY = (this.f_y1[c] + this.f_y2[c]) / 2; } } } } } } } /** * general routine to see if we add a space between 2 text fragments */ final private String isGapASpace(int c, int l, float actualGap, boolean addMultiplespaceXMLTag, int writingMode) { String sep = ""; float gap; // use smaller gap float gapA = this.spaceWidth[c] * this.fontSize[c]; float gapB = this.spaceWidth[l] * this.fontSize[l]; if (gapA > gapB) gap = gapB; else gap = gapA; gap = (actualGap / (gap / 1000)); // Round values to closest full integer as float -> int conversion rounds down if (gap > 0.51f && gap < 1) gap = 1; int spaceCount = (int) gap; if (spaceCount > 0) sep = " "; /** add an XML tag to flag multiple spaces */ if (spaceCount > 1 && addMultiplespaceXMLTag && writingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) sep = " "; return sep; } /** * merge 2 text fragments together and update co-ordinates */ final private void merge(int m, int c, String separator, boolean moveFont) { // update co-ords if (this.f_x1[m] > this.f_x1[c]) this.f_x1[m] = this.f_x1[c]; if (this.f_y1[m] < this.f_y1[c]) this.f_y1[m] = this.f_y1[c]; if (this.f_x2[m] < this.f_x2[c]) this.f_x2[m] = this.f_x2[c]; if (this.f_y2[m] > this.f_y2[c]) this.f_y2[m] = this.f_y2[c]; if (this.isXMLExtraction) { String test = Fonts.fe; // add color tag if needed and changes if (this.colorExtracted) test = Fonts.fe + GenericColorSpace.ce; // move if needed and add separator if ((moveFont) && (this.content[m].toString().lastIndexOf(test) != -1)) { String master = this.content[m].toString(); this.content[m] = new StringBuilder(master.substring(0, master.lastIndexOf(test))); this.content[m].append(separator); this.content[m].append(master.substring(master.lastIndexOf(test))); } else { this.content[m].append(separator); } // Only map out space if text length is longer than 1 if 
(this.textLength[c] > 1 && this.content[m].toString().endsWith(" ")) { this.content[m].deleteCharAt(this.content[m].lastIndexOf(" ")); } // use font size of second text (ie at end of merged text) this.fontSize[m] = this.fontSize[c]; // Remove excess / redundent xml tags if (this.content[c].indexOf("", this.content[m].lastIndexOf("") + 7 == this.content[m].lastIndexOf(">")) { this.content[c].replace(this.content[c].indexOf("") + 1, ""); this.content[m].replace(this.content[m].lastIndexOf(""), this.content[m].lastIndexOf("") + 8, ""); } } if (this.content[c].indexOf("", this.content[m].lastIndexOf("") + 6 == this.content[m].lastIndexOf(">")) { this.content[c].replace(this.content[c].indexOf("") + 1, ""); this.content[m].replace(this.content[m].lastIndexOf(""), this.content[m].lastIndexOf("") + 7, ""); } } this.content[m] = this.content[m].append(this.content[c]); // track length of text less all tokens this.textLength[m] = this.textLength[m] + this.textLength[c]; // set objects to null to flush and log as used this.isUsed[c] = true; this.content[c] = null; } else { // use font size of second text (ie at end of merged text) this.fontSize[m] = this.fontSize[c]; // add together this.content[m] = this.content[m].append(separator).append(this.content[c]); // track length of text less all tokens this.textLength[m] = this.textLength[m] + this.textLength[c]; // set objects to null to flush and log as used this.isUsed[c] = true; this.content[c] = null; } } /** * remove width data we may have buried in data */ final private void removeEncoding() { // get list of items int[] items = getUnusedFragments(); int current; // work through objects and eliminate shadows or roll together overlaps for (int item : items) { // master item current = item; // ignore used items and remove widths we hid in data if (this.isUsed[current] == false) this.content[current] = removeHiddenMarkers(current); } } /** * put raw data into Arrays for quick merging breakup_fragments shows if we break on 
vertical lines and spaces */ final private void copyToArrays() { this.colorExtracted = this.pdf_data.isColorExtracted(); int count = this.pdf_data.getRawTextElementCount(); // local lists for faster access this.isUsed = new boolean[count]; this.fontSize = new int[count]; this.writingMode = new int[count]; this.spaceWidth = new float[count]; this.content = new StringBuilder[count]; this.textLength = new int[count]; this.f_x1 = new float[count]; this.f_colorTag = new String[count]; this.f_x2 = new float[count]; this.f_y1 = new float[count]; this.f_y2 = new float[count]; this.moveType = new int[count]; // set values for (int i = 0; i < count; i++) { this.content[i] = new StringBuilder(this.pdf_data.contents[i]); this.fontSize[i] = this.pdf_data.f_end_font_size[i]; this.writingMode[i] = this.pdf_data.f_writingMode[i]; this.f_x1[i] = this.pdf_data.f_x1[i]; this.f_colorTag[i] = this.pdf_data.colorTag[i]; this.f_x2[i] = this.pdf_data.f_x2[i]; this.f_y1[i] = this.pdf_data.f_y1[i]; this.f_y2[i] = this.pdf_data.f_y2[i]; this.moveType[i] = this.pdf_data.move_command[i]; this.spaceWidth[i] = this.pdf_data.space_width[i]; this.textLength[i] = this.pdf_data.text_length[i]; } } /** * get list of unused fragments and put in list */ private int[] getUnusedFragments() { int total_fragments = this.isUsed.length; // get unused item pointers int ii = 0; int temp_index[] = new int[total_fragments]; for (int i = 0; i < total_fragments; i++) { if (this.isUsed[i] == false) { temp_index[ii] = i; ii++; } } // put into correctly sized array int[] items = new int[ii]; System.arraycopy(temp_index, 0, items, 0, ii); return items; } /** * strip the hidden numbers of position we encoded into the data (could be coded to be faster by not using Tokenizer) */ private StringBuilder removeHiddenMarkers(int c) { // make sure has markers and ignore if not if (this.content[c].indexOf(MARKER) == -1) return this.content[c]; // strip the markers StringTokenizer tokens = new 
StringTokenizer(this.content[c].toString(), MARKER, true); String temp; StringBuilder processedData = new StringBuilder(); // with a token to make sure cleanup works while (tokens.hasMoreTokens()) { // strip encoding in data temp = tokens.nextToken(); // see if first marker if (temp.equals(MARKER)) { tokens.nextToken(); // point character starts tokens.nextToken(); // second marker tokens.nextToken(); // width tokens.nextToken(); // third marker // put back chars processedData = processedData.append(tokens.nextToken()); } else processedData = processedData.append(temp); } return processedData; } /** * sets if we include HTML in teasers (do we want this is word or this is word as teaser) * * @param value */ public void setIncludeHTML(boolean value) { this.includeHTMLtags = value; } /** * method to show data without encoding */ public static String removeHiddenMarkers(String contents) { // trap null if (contents == null) return null; // run though the string extracting our markers // make sure has markers and ignore if not if (!contents.contains(MARKER)) return contents; // strip the markers StringTokenizer tokens = new StringTokenizer(contents, MARKER, true); String temp_token=null; StringBuilder processed_data = new StringBuilder(); boolean pushBackByOne = false; // with a token to make sure cleanup works while (tokens.hasMoreTokens()) { if(!pushBackByOne) { // encoding in data temp_token = tokens.nextToken(); // see if first marker } else { //skip fetching nextToken() since it was fetched in the last round pushBackByOne=false; } if (MARKER.equals(temp_token)) { tokens.nextToken(); // point character starts tokens.nextToken(); // second marker tokens.nextToken(); // width tokens.nextToken(); // third marker //Lonzak: There are PDFs which contain \0\0 (should be e.g. \0 \0 or \0c\0...) 
and then the lexer gets confused //thus do a push back String next = tokens.nextToken(); if(next.equals(MARKER)) { pushBackByOne=true; } else { // put back chars processed_data = processed_data.append(next); } } else { // value processed_data = processed_data.append(temp_token); } } return processed_data.toString(); } /** * Method to try and find vertical lines in close data (not as efficient as it could be) * * @throws PdfException */ private void findVerticalLines(float minX, float minY, float maxX, float maxY, int currentWritingMode) throws PdfException { // hold counters on all x values HashMap xLines = new HashMap(); // counter on most popular item int most_frequent = 0, count = this.pdf_data.getRawTextElementCount(); float x1, x2, y1, y2; String raw; for (int i = 0; i < count; i++) { float currentX = 0, lastX; Integer intX; // extract values for data raw = this.pdf_data.contents[i]; /** * set pointers so left to right text */ if (currentWritingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) { x1 = this.f_x1[i]; x2 = this.f_x2[i]; y1 = this.f_y1[i]; y2 = this.f_y2[i]; } else if (currentWritingMode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) { x2 = this.f_x1[i]; x1 = this.f_x2[i]; y1 = this.f_y1[i]; y2 = this.f_y2[i]; } else if (currentWritingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) { x1 = this.f_y1[i]; x2 = this.f_y2[i]; y1 = this.f_x2[i]; y2 = this.f_x1[i]; } else if (currentWritingMode == PdfData.VERTICAL_TOP_TO_BOTTOM) { x1 = this.f_y2[i]; x2 = this.f_y1[i]; y2 = this.f_x1[i]; y1 = this.f_x2[i]; } else { throw new PdfException("Illegal value " + currentWritingMode + "for currentWritingMode"); } // if in the area, process if ((x1 > minX - .5) && (x2 < maxX + .5) && (y2 > minY - .5) && (y1 < maxY + .5)) { // run though the string extracting our markers to get x values StringTokenizer tokens = new StringTokenizer(raw, MARKER, true); String value, lastValue = ""; Object currentValue; while (tokens.hasMoreTokens()) { // encoding in data value = tokens.nextToken(); // see if 
first marker if (value.equals(MARKER)) { value = tokens.nextToken(); // point character starts if (value.length() > 0) { lastX = currentX; currentX = Float.parseFloat(value); try { // add x to list or increase counter at start // or on space // add points either side of space if (lastValue.length() == 0 || (lastValue.indexOf(' ') != -1)) { intX = (int) currentX; currentValue = xLines.get(intX); if (currentValue == null) { xLines.put(intX, 1); } else { int countReached = (Integer) currentValue; countReached++; if (countReached > most_frequent) most_frequent = countReached; xLines.put(intX, countReached); } // work out the middle int middle = (int) (lastX + ((currentX - lastX) / 2)); if (lastX != 0) { intX = middle; currentValue = xLines.get(intX); if (currentValue == null) { xLines.put(intX, 1); } else { int count_reached = (Integer) currentValue; count_reached++; if (count_reached > most_frequent) most_frequent = count_reached; xLines.put(intX, count_reached); } } } } catch (Exception e) { LogWriter.writeLog("Exception " + e + " stripping x values"); } } tokens.nextToken(); // second marker tokens.nextToken(); // glyph width tokens.nextToken(); // third marker value = tokens.nextToken(); // put back chars lastValue = value; } } } } // now analyse the data Iterator keys = xLines.keySet().iterator(); int minimum_needed = most_frequent / 2; while (keys.hasNext()) { Integer current_key = (Integer) keys.next(); int current_count = (Integer) xLines.get(current_key); if (current_count > minimum_needed) this.lineBreaks.addElement(current_key); } } /** * Method splitFragments adds raw frgaments to processed fragments breaking up any with vertical lines through or what looks like tabbed spaces * * @throws PdfException */ private void copyToArrays(float minX, float minY, float maxX, float maxY, boolean keepFont, boolean breakOnSpace, boolean findLines, String punctuation, boolean isWordlist) throws PdfException { final boolean debugSplit = false; // initialise local arrays 
allow for extra space int count = this.pdf_data.getRawTextElementCount() + increment; this.f_x1 = new float[count]; this.f_colorTag = new String[count]; this.hadSpace = new boolean[count]; this.f_x2 = new float[count]; this.f_y1 = new float[count]; this.f_y2 = new float[count]; this.spaceWidth = new float[count]; this.content = new StringBuilder[count]; this.fontSize = new int[count]; this.textLength = new int[count]; this.writingMode = new int[count]; this.isUsed = new boolean[count]; this.moveType = new int[count]; // flag to find lines based on orientation of first text item*/ boolean linesScanned = false; // set defaults and calculate dynamic values int text_length; count = count - increment; float last_pt, min, max, pt, x1, x2, y1, y2, linePos, character_spacing; String raw, char_width = "", currentColor; StringBuilder text = new StringBuilder(); // work through fragments for (int i = 0; i < count; i++) { // extract values character_spacing = this.pdf_data.f_character_spacing[i]; raw = this.pdf_data.contents[i]; x1 = this.pdf_data.f_x1[i]; currentColor = this.pdf_data.colorTag[i]; x2 = this.pdf_data.f_x2[i]; y1 = this.pdf_data.f_y1[i]; y2 = this.pdf_data.f_y2[i]; text_length = this.pdf_data.text_length[i]; int mode = this.pdf_data.f_writingMode[i]; int moveType = this.pdf_data.move_command[i]; /** * see if in area */ boolean accepted = false; if (debugSplit) { System.out.println("raw data=" + raw); System.out.println("text data=" + PdfGroupingAlgorithms.removeHiddenMarkers(raw)); } // if at least partly in the area, process if ((mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) && y2 > minY && y1 < maxY && x1 < maxX && x2 > minX) { accepted = true; } else if ((mode == PdfData.VERTICAL_BOTTOM_TO_TOP || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) && x1 > minX && x2 < maxX && y1 > minY && y2 < maxY) accepted = true; if (accepted) { /** find lines */ // look for possible vertical or horizontal lines in the data if ((!linesScanned) 
&& (findLines)) { findVerticalLines(minX, minY, maxX, maxY, mode); linesScanned = true; } /** * initialise pointers and work out an 'average character space' **/ if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) { // space = (x2 - x1) / text_length; pt = x1; last_pt = x1; min = minX; max = maxX; } else { // vertical text // space = (y1 - y2) / text_length; pt = y2; last_pt = y2; min = minY; max = maxY; } linePos = -1; /** * work through text, using embedded markers to work out whether each letter is IN or OUT */ char[] line = raw.toCharArray(); int end = line.length; int pointer = 0; String value, textValue = "", pt_reached; // allow for no tokens and return all text fragment if (!raw.contains(MARKER)) text = new StringBuilder(raw); boolean isFirstValue = true, breakPointset = false; /** * work through text, using embedded markers to work out whether each letter is IN or OUT */ while (pointer < end) { // only data between min and y locations while (true) { /** * read value */ if (line[pointer] != MARKER2) { // find second marker and get width int startPointer = pointer; while ((pointer < end) && (line[pointer] != MARKER2)) pointer++; value = raw.substring(startPointer, pointer); } else {// if (value.equals(MARKER)) { // read the next token and its location and width // find first marker while ((pointer < end) && (line[pointer] != MARKER2)) pointer++; pointer++; // find second marker and get width int startPointer = pointer; while ((pointer < end) && (line[pointer] != MARKER2)) pointer++; pt_reached = raw.substring(startPointer, pointer); pointer++; // find third marker startPointer = pointer; while ((pointer < end) && (line[pointer] != MARKER2)) pointer++; char_width = raw.substring(startPointer, pointer); pointer++; // find next marker startPointer = pointer; while ((pointer < end) && (line[pointer] != MARKER2)) pointer++; value = raw.substring(startPointer, pointer); textValue = value; // keep value with no spaces if 
(pt_reached.length() > 0) { // set point character starts last_pt = pt; pt = Float.parseFloat(pt_reached); if (breakPointset) { if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) x1 = pt; else if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) x2 = pt; else if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) y2 = pt; else if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) y1 = pt; breakPointset = false; } } // add font start if needed if ((this.isXMLExtraction) && (last_pt < min) && (pt > min) && (!value.startsWith(Fonts.fb))) value = Fonts .getActiveFontTag(raw, "") + value; } if ((pt > min) & (pt < max)) { if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) if ((x1 < min || x1 > max) && pt >= min) x1 = pt; else if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) if ((x2 > max || x2 < min) && pt <= max) x2 = pt; else if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) if ((y2 < min || y2 > max) && pt >= min) y2 = pt; else if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) if ((y1 < min || y1 > max) && pt <= min) y1 = pt; break; } value = ""; textValue = ""; if (pointer >= end) break; } /** make sure font not sliced off on first value */ if ((isFirstValue)) { isFirstValue = false; if ((this.isXMLExtraction) && (keepFont) && (!value.startsWith(Fonts.fb)) && (!value.startsWith(GenericColorSpace.cb))) // &&(!text.toString().startsWith(Fonts.fb)))) text.append(Fonts.getActiveFontTag(text.toString(), raw)); } /** * we now have a valid value inside the selected area so perform tests */ // see if a break occurs boolean is_broken = false; if (findLines && character_spacing > 0 && text.toString().endsWith(" ")) { int counts = this.lineBreaks.size(); for (int jj = 0; jj < counts; jj++) { int test_x = this.lineBreaks.elementAt(jj); if ((last_pt < test_x) & (pt > test_x)) { jj = counts; is_broken = true; } } } boolean endsWithPunctuation = checkForPunctuation(textValue, punctuation); if (is_broken) { // break on double-spaces or larger if (debugSplit) System.out.println("Break 1 is_broken"); float Nx1 = x1, Nx2 = x2, Ny1 = y1, 
Ny2 = y2; if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) Nx2 = last_pt + Float.parseFloat(char_width); else if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) Nx1 = last_pt + Float.parseFloat(char_width); else if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) Ny1 = last_pt + Float.parseFloat(char_width); else if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) Ny2 = last_pt + Float.parseFloat(char_width); addFragment(moveType, i, text, Nx1, Nx2, Ny1, Ny2, text_length, keepFont, currentColor, isWordlist); text = new StringBuilder(Fonts.getActiveFontTag(text.toString(), raw)); text.append(value); if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) x1 = pt; else if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) x2 = pt; else if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) y2 = pt; else if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) y1 = pt; } else if ((endsWithPunctuation) | ((breakOnSpace) && ((textValue.indexOf(' ') != -1) || (value.endsWith(" ")))) | ((textValue.contains(" ")))) {// break on double-spaces or larger if (debugSplit) System.out.println("Break 2 endsWithPunctuation=" + endsWithPunctuation + " textValue=" + textValue + '<' + " value=" + value + '<' + " text=" + text + '<'); // Remove final bit of the below if to fix issue in case 11542 if (textValue.length() > 1 && textValue.indexOf(' ') != -1) {// && x1==pt){ //add in space values to start of next shape // count the spaces int ptr = textValue.indexOf(' '); if (ptr > 0) { pt = pt + ptr * (Float.parseFloat(char_width) / textValue.length()); } // else // pt=pt+Float.parseFloat(char_width); } if (!endsWithPunctuation) text.append(value.trim()); if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) { if (debugSplit) System.out.println("Add " + x1 + ' ' + pt + " text=" + text + " i=" + i); addFragment(moveType, i, text, x1, pt, y1, y2, text_length, keepFont, currentColor, isWordlist); } else if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) { if (debugSplit) System.out.println("b"); addFragment(moveType, i, text, pt, x2, y1, y2, text_length, keepFont, 
currentColor, isWordlist); } else if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) { if (debugSplit) System.out.println("c"); addFragment(moveType, i, text, x1, x2, pt, y2, text_length, keepFont, currentColor, isWordlist); } else if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) { if (debugSplit) System.out.println("d"); addFragment(moveType, i, text, x1, x2, y1, pt, text_length, keepFont, currentColor, isWordlist); } if (char_width.length() > 0) { // add in space values to start of next shape // count the spaces int ptr = 0; if (textValue.indexOf(' ') != -1) ptr = textValue.indexOf(' '); if (isWordlist) { int len = textValue.length(); while (ptr < len && textValue.charAt(ptr) == ' ') { ptr++; } } if (ptr > 0) pt = pt + ptr * Float.parseFloat(char_width); else pt = pt + Float.parseFloat(char_width); if (ptr > 0) breakPointset = true; else breakPointset = false; } // store fact it had a space in case we generate wordlist if ((breakOnSpace) & (this.nextSlot > 0)) this.hadSpace[this.nextSlot - 1] = true; text = new StringBuilder(Fonts.getActiveFontTag(text.toString(), raw)); if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) x1 = pt;// + space; else if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) x2 = pt;// - space; else if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) y2 = pt;// + space; else if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) y1 = pt;// - space; } else if ((linePos != -1) & (pt > linePos)) {// break on a vertical line if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) addFragment(moveType, i, text, x1, linePos, y1, y2, text_length, keepFont, currentColor, isWordlist); else if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) addFragment(moveType, i, text, linePos, x2, y1, y2, text_length, keepFont, currentColor, isWordlist); else if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) addFragment(moveType, i, text, x1, x2, linePos, y2, text_length, keepFont, currentColor, isWordlist); else if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) addFragment(moveType, i, text, x1, x2, y1, linePos, text_length, 
keepFont, currentColor, isWordlist); text = new StringBuilder(Fonts.getActiveFontTag(text.toString(), raw)); text.append(value); if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) x1 = linePos; else if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) x2 = linePos; else if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) y2 = linePos; else if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) y1 = linePos; linePos = -1; } else { // allow for space used as tab if ((this.isXMLExtraction) && (value.endsWith(' ' + Fonts.fe))) { value = Fonts.fe; textValue = ""; if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) x2 = last_pt; else if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) x1 = last_pt; else if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) y1 = last_pt; else if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) y2 = last_pt; } text.append(value); } } // trap scenario we found if all goes through with no break at end if ((keepFont) && (this.isXMLExtraction) && (!text.toString().endsWith(Fonts.fe)) && (!text.toString().endsWith(GenericColorSpace.ce))) text.append(Fonts.fe); // create new line with what is left and output if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) { if (x1 < x2) addFragment(moveType, i, text, x1, x2, y1, y2, text_length, keepFont, currentColor, isWordlist); } else if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) { if (y1 > y2) addFragment(moveType, i, text, x1, x2, y1, y2, text_length, keepFont, currentColor, isWordlist); } text = new StringBuilder(); } } // local lists for faster access this.isUsed = new boolean[this.nextSlot]; } /** * @param textValue */ private static boolean checkForPunctuation(String textValue, String punctuation) { if (punctuation == null) return false; /** see if ends with punctuation */ boolean endsWithPunctuation = false; int textLength = textValue.length(); int ii = textLength - 1; if (textLength > 0) { // strip any spaces and tags in test char testChar = textValue.charAt(ii); boolean inTag = 
(testChar == '>'); while (((inTag) | (testChar == ' ')) & (ii > 0)) { if (testChar == '<') inTag = false; ii--; testChar = textValue.charAt(ii); if (testChar == '>') inTag = true; } // stop matches on &; if ((testChar == ';')) { // ignore if looks like &xxx; endsWithPunctuation = true; ii--; while (ii > -1) { testChar = textValue.charAt(ii); if (testChar == '&' || testChar == '#') { endsWithPunctuation = false; ii = 0; } if (ii == 0 || testChar == ' ' || !Character.isLetterOrDigit(testChar)) break; ii--; } } else if (punctuation.indexOf(testChar) != -1) endsWithPunctuation = true; } return endsWithPunctuation; } /** * add an object to our new XML list */ private void addFragment(int moveType, int index, StringBuilder contentss, float x1, float x2, float y1, float y2, int text_len, boolean keepFontTokens, String currentColorTag, boolean isWordlist) { StringBuilder current_text = contentss; String str = current_text.toString(); // strip <> or ascii equivalents if (isWordlist) { if (str.contains("&#")) current_text = Strip.stripAmpHash(current_text); if ((this.isXMLExtraction) && ((str.contains("<")) || (str.contains(">")))) current_text = Strip.stripXMLArrows(current_text, true); else if ((!this.isXMLExtraction) && ((str.indexOf('<') != -1) || (str.indexOf('>') != -1))) current_text = Strip.stripArrows(current_text); } // StringBuilder justText=Strip.stripXML(current_text); // ignore blank space objects // if (justText.length() == 0) { if (getFirstChar(current_text) != -1) { // strip tags or pick up missed if ends with space if (keepFontTokens == false) { // strip fonts if required current_text = Strip.stripXML(current_text, this.isXMLExtraction); } else if (this.isXMLExtraction) { // no color tag if (this.pdf_data.isColorExtracted() && (!current_text.toString().endsWith(GenericColorSpace.ce))) { // se // if ends add // otherwise add if (!current_text.toString().endsWith(Fonts.fe)) current_text = current_text.append(Fonts.fe); current_text = 
current_text.append(GenericColorSpace.ce); } else if ((!this.pdf_data.isColorExtracted()) && (!current_text.toString().endsWith(Fonts.fe))) current_text = current_text .append(Fonts.fe); } // add to vacant slot or create new slot int count = this.f_x1.length; if (this.nextSlot < count) { this.f_x1[this.nextSlot] = x1; this.f_colorTag[this.nextSlot] = currentColorTag; this.f_x2[this.nextSlot] = x2; this.f_y1[this.nextSlot] = y1; this.f_y2[this.nextSlot] = y2; this.moveType[this.nextSlot] = moveType; this.fontSize[this.nextSlot] = this.pdf_data.f_end_font_size[index]; this.writingMode[this.nextSlot] = this.pdf_data.f_writingMode[index]; this.textLength[this.nextSlot] = text_len; this.spaceWidth[this.nextSlot] = this.pdf_data.space_width[index]; this.content[this.nextSlot] = current_text; this.nextSlot++; } else { count = count + increment; float[] t_x1 = new float[count]; String[] t_colorTag = new String[count]; float[] t_x2 = new float[count]; float[] t_y1 = new float[count]; float[] t_y2 = new float[count]; float[] t_spaceWidth = new float[count]; StringBuilder[] t_content = new StringBuilder[count]; int[] t_font_size = new int[count]; int[] t_text_len = new int[count]; int[] t_writingMode = new int[count]; int[] t_moveType = new int[count]; boolean[] t_isUsed = new boolean[count]; boolean[] t_hadSpace = new boolean[count]; // copy in existing for (int i = 0; i < count - increment; i++) { t_x1[i] = this.f_x1[i]; t_colorTag[i] = this.f_colorTag[i]; t_x2[i] = this.f_x2[i]; t_y1[i] = this.f_y1[i]; t_y2[i] = this.f_y2[i]; t_hadSpace[i] = this.hadSpace[i]; t_spaceWidth[i] = this.spaceWidth[i]; t_content[i] = this.content[i]; t_font_size[i] = this.fontSize[i]; t_writingMode[i] = this.writingMode[i]; t_text_len[i] = this.textLength[i]; t_isUsed[i] = this.isUsed[i]; t_moveType[i] = this.moveType[i]; } this.f_x1 = t_x1; this.f_colorTag = t_colorTag; this.hadSpace = t_hadSpace; this.f_x2 = t_x2; this.f_y1 = t_y1; this.f_y2 = t_y2; this.isUsed = t_isUsed; this.fontSize = 
t_font_size; this.writingMode = t_writingMode; this.textLength = t_text_len; this.spaceWidth = t_spaceWidth; this.content = t_content; this.moveType = t_moveType; this.f_x1[this.nextSlot] = x1; this.f_colorTag[this.nextSlot] = currentColorTag; this.f_x2[this.nextSlot] = x2; this.f_y1[this.nextSlot] = y1; this.f_y2[this.nextSlot] = y2; this.fontSize[this.nextSlot] = this.pdf_data.f_end_font_size[index]; this.writingMode[this.nextSlot] = this.pdf_data.f_writingMode[index]; t_text_len[this.nextSlot] = text_len; this.content[this.nextSlot] = current_text; this.spaceWidth[this.nextSlot] = this.pdf_data.space_width[index]; this.moveType[this.nextSlot] = moveType; this.nextSlot++; } } } // //////////////////////////////////////////////////////////////////// /** * put rows together into one object with start and end */ private void mergeTableRows(int border_width) { // merge row contents String separator = "\n"; if (this.isXHTML == false) separator = "\n"; this.master = ((Vector_Int) this.lines.elementAt(this.line_order[0])).elementAt(0); int item; for (int rr = 1; rr < this.max_rows; rr++) { item = ((Vector_Int) this.lines.elementAt(this.line_order[rr])).elementAt(0); if (this.content[this.master] == null) this.master = item; else if (this.content[item] != null) merge(this.master, item, separator, false); } // add start/end marker if (this.isXHTML) { if (border_width == 0) { this.content[this.master].insert(0, "\n"); this.content[this.master].append("\n
\n"); } else { StringBuilder startTag = new StringBuilder("\n"); startTag.append(this.content[this.master]); this.content[this.master] = startTag; this.content[this.master].append("\n
\n"); } } } // //////////////////////////////////////////////// /** * get list of unused fragments and put in list and sort in sorted_items */ final private int[] getsortedUnusedFragments(boolean sortOnX, boolean use_y1) { int total_fragments = this.isUsed.length; // get unused item pointers int ii = 0; int sorted_temp_index[] = new int[total_fragments]; for (int i = 0; i < total_fragments; i++) { if (this.isUsed[i] == false) { sorted_temp_index[ii] = i; ii++; } } int[] unsorted_items = new int[ii]; int[] sorted_items; int[] sorted_temp_x1 = new int[ii]; int[] sorted_temp_y1 = new int[ii]; int[] sorted_temp_y2 = new int[ii]; // put values in array and get x/y for sort for (int pointer = 0; pointer < ii; pointer++) { int i = sorted_temp_index[pointer]; unsorted_items[pointer] = i; sorted_temp_x1[pointer] = (int) this.f_x1[i]; // negative values to get sort in 'wrong' order from top of page sorted_temp_y1[pointer] = (int) this.f_y1[i]; sorted_temp_y2[pointer] = (int) this.f_y2[i]; } // sort if (sortOnX == false) { if (use_y1 == true) sorted_items = Sorts.quicksort(sorted_temp_y1, sorted_temp_x1, unsorted_items); else sorted_items = Sorts.quicksort(sorted_temp_y2, sorted_temp_x1, unsorted_items); } else sorted_items = Sorts.quicksort(sorted_temp_x1, sorted_temp_y1, unsorted_items); return sorted_items; } // //////////////////////////////////////////////////////////////////// /** * create rows of data from preassembled indices, adding separators. 
Each row is built to a temp array and then row created - we don't know how * many columns until the table is built * * @throws PdfException */ private void createTableRows(boolean keep_alignment_information, boolean keep_width_information, int currentWritingMode) throws PdfException { /** * create local copies of arrays */ float[] f_x1, f_x2; /** * set pointers so left to right text */ if (currentWritingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) { f_x1 = this.f_x1; f_x2 = this.f_x2; // f_y1=this.f_y1; // f_y2=this.f_y2; } else if (currentWritingMode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) { f_x2 = this.f_x1; f_x1 = this.f_x2; // f_y1=this.f_y1; // f_y2=this.f_y2; } else if (currentWritingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) { f_x1 = this.f_y2; f_x2 = this.f_y1; // f_y1=this.f_x2; // f_y2=this.f_x1; } else if (currentWritingMode == PdfData.VERTICAL_TOP_TO_BOTTOM) { f_x1 = this.f_y1; f_x2 = this.f_y2; // f_y2=this.f_x1; // f_y1=this.f_x2; /** * fiddle x,y co-ords so it works */ // get max size int maxX = 0; for (float aF_x1 : f_x1) { if (maxX < aF_x1) maxX = (int) aF_x1; } maxX++; // allow for fp error // turn around for (int ii = 0; ii < f_x2.length; ii++) { f_x1[ii] = maxX - f_x1[ii]; f_x2[ii] = maxX - f_x2[ii]; } } else { throw new PdfException("Illegal value " + currentWritingMode + "for currentWritingMode"); } int item, i;// , current_col = -1; int itemsInTable = 0, items_added = 0; // pointer to current element on each row int[] currentItem = new int[this.max_rows]; Vector_Int[] rowContents = new Vector_Int[this.max_rows]; Vector_String alignments = new Vector_String(); // text alignment Vector_Float widths = new Vector_Float(); // cell widths Vector_Float cell_x1 = new Vector_Float(); // cell widths String separator = "", empty_cell = " "; if (this.isXHTML == false) { separator = "\",\""; empty_cell = ""; } /** * set number of items on each line, column count and populate empty rows */ int[] itemCount = new int[this.max_rows]; for (i = 0; i < this.max_rows; 
i++) { itemCount[i] = ((Vector_Int) this.lines.elementAt(i)).size() - 1; // total number of items itemsInTable = itemsInTable + itemCount[i]; // reset other values currentItem[i] = 0; rowContents[i] = new Vector_Int(20); } // now work through and split any overlapping items until all done while (true) { // size of column and pointers float x1 = 9999, min_x2 = 9999, x2, current_x1, current_x2, c_x1, next_x1 = 9999, c_x2, items_in_column = 0; boolean all_done = true; // flag to exit at end float total_x1 = 0, total_x2 = 0, left_gap = 0, right_gap; String alignment = "center"; if (items_added < itemsInTable) { /** * work out cell x boundaries on basis of objects */ for (i = 0; i < this.max_rows; i++) { // get width for column if (itemCount[i] > currentItem[i]) { // item id item = ((Vector_Int) this.lines.elementAt(i)).elementAt(currentItem[i]); current_x1 = f_x1[item]; current_x2 = f_x2[item]; if (current_x1 < x1) // left margin x1 = current_x1; if (current_x2 < min_x2) // right margin if appropriate min_x2 = current_x2; } } cell_x1.addElement(x1); // save left margin x2 = min_x2; // set default right margin /** * workout end and next column start by scanning all items */ for (i = 0; i < this.max_rows; i++) { // slot the next item on each row together work out item item = ((Vector_Int) this.lines.elementAt(i)).elementAt(currentItem[i]); c_x1 = f_x1[item]; c_x2 = f_x2[item]; // max item width of this column if ((c_x1 >= x1) & (c_x1 < min_x2) & (c_x2 > x2)) x2 = c_x2; if (currentItem[i] < itemCount[i]) { // next left margin item = ((Vector_Int) this.lines.elementAt(i)).elementAt(currentItem[i] + 1); current_x1 = f_x1[item]; if ((current_x1 > min_x2) & (current_x1 < next_x1)) next_x1 = current_x1; } } // stop infinite loop case if (x1 == x2) break; // allow for last column if (next_x1 == 9999) next_x1 = x2; /** * count items in table and workout raw totals for alignment. 
Also work out widest x2 in column */ for (i = 0; i < this.max_rows; i++) { // slot the next item on each row together // work out item item = ((Vector_Int) this.lines.elementAt(i)).elementAt(currentItem[i]); c_x1 = f_x1[item]; c_x2 = f_x2[item]; // use items in first column of single colspan if ((c_x1 >= x1) & (c_x1 < min_x2) & (c_x2 <= next_x1)) { // running totals to calculate alignment total_x1 = total_x1 + c_x1; total_x2 = total_x2 + c_x2; items_in_column++; } } /** * work out gap and include empty space between cols and save */ if (i == 0) left_gap = x1; if (next_x1 == -1) right_gap = 0; else right_gap = (int) ((next_x1 - x2) / 2); int width = (int) (x2 - x1 + right_gap + left_gap); // noinspection UnusedAssignment,UnusedAssignment left_gap = right_gap; widths.addElement(width); /** workout the alignment */ float x1_diff = (total_x1 / items_in_column) - x1; float x2_diff = x2 - (total_x2 / items_in_column); if (x1_diff < 1) alignment = "left"; else if (x2_diff < 1) alignment = "right"; alignments.addElement(alignment); for (i = 0; i < this.max_rows; i++) { // slot the next item on each row together this.master = ((Vector_Int) this.lines.elementAt(i)).elementAt(0); // get next item on line or -1 for no more if (itemCount[i] > currentItem[i]) { // work out item item = ((Vector_Int) this.lines.elementAt(i)).elementAt(currentItem[i]); c_x1 = f_x1[item]; c_x2 = f_x2[item]; all_done = false; } else { item = -1; c_x1 = -1; c_x2 = -1; } if ((item == -1) & (items_added <= itemsInTable)) { // all items in table so just filling in gaps rowContents[i].addElement(-1); } else if ((c_x1 >= x1) & (c_x1 < x2)) { // fits into cell so add in and roll on marker rowContents[i].addElement(item); currentItem[i]++; items_added++; } else if (c_x1 > x2) { // empty cell rowContents[i].addElement(-1); } } } if (all_done) break; } // =================================================================== /** * now assemble rows */ for (int row = 0; row < this.max_rows; row++) { StringBuilder 
line_content = new StringBuilder(100); int count = rowContents[row].size() - 1; this.master = ((Vector_Int) this.lines.elementAt(row)).elementAt(0); for (i = 0; i < count; i++) { item = rowContents[row].elementAt(i); if (this.isXHTML) { // get width float current_width = widths.elementAt(i); String current_alignment = alignments.elementAt(i); int test, colspan = 1, pointer = i + 1; if (item != -1) { // look for colspan while (true) { test = rowContents[row].elementAt(i + 1); if ((test != -1) | (count == i + 1)) break; // break if over another col - roll up single value on line if ((itemCount[row] > 1) & (cell_x1.elementAt(i + 1) > f_x2[item])) break; count--; rowContents[row].removeElementAt(i + 1); colspan++; // update width current_width = current_width + widths.elementAt(pointer); pointer++; } } line_content.append(" 1) line_content.append(" colspan='").append(colspan).append('\''); } if (keep_width_information) line_content.append(" width='").append((int) current_width).append('\''); line_content.append(" nowrap>"); if (item == -1) line_content.append(empty_cell); else line_content.append(this.content[item]); line_content.append(""); } else { // csv if (item == -1) // empty col line_content.append("\"\","); else { // value line_content.append('\"'); line_content.append(this.content[item]); line_content.append("\","); } } // merge to update other values if ((item != -1) && (this.master != item)) // merge tracks the shape merge(this.master, item, separator, false); } // substitute our 'hand coded' value this.content[this.master] = line_content; } } /** * work through data and create a set of rows and return an object with refs for each line * * @throws PdfException */ private void createLinesInTable(int itemCount, int[] items, boolean addSpaceXMLTag, int mode) throws PdfException { /** * reverse order if text right to left */ if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) items = reverse(items); /** * create and populate local copies of arrays */ float[] f_x1, 
f_x2, f_y1, f_y2; // set pointers so always left to right text switch (mode) { case PdfData.HORIZONTAL_LEFT_TO_RIGHT: f_x1 = this.f_x1; f_x2 = this.f_x2; f_y1 = this.f_y1; f_y2 = this.f_y2; break; case PdfData.HORIZONTAL_RIGHT_TO_LEFT: f_x2 = this.f_x1; f_x1 = this.f_x2; f_y1 = this.f_y1; f_y2 = this.f_y2; break; case PdfData.VERTICAL_BOTTOM_TO_TOP: f_x1 = this.f_y1; f_x2 = this.f_y2; f_y1 = this.f_x2; f_y2 = this.f_x1; break; case PdfData.VERTICAL_TOP_TO_BOTTOM: f_x1 = this.f_y2; f_x2 = this.f_y1; f_y2 = this.f_x1; f_y1 = this.f_x2; items = this.getsortedUnusedFragments(false, true); items = reverse(items); break; default: throw new PdfException("Illegal value " + mode + "for currentWritingMode"); } // holds line we're working on Vector_Int current_line; for (int j = 0; j < itemCount; j++) { // for all items int c = items[j], id = -1, i, last = c; float smallest_gap = -1, gap, yMidPt; if (!this.isUsed[c] && this.writingMode[c] == mode) { // reset pointer and add this element current_line = new Vector_Int(20); current_line.addElement(c); this.lineY2.addElement((int) f_y2[c]); // look for items along same line (already sorted into order left to right) while (true) { // look for a match for (int ii = 0; ii < itemCount; ii++) { i = items[ii]; if (!this.isUsed[i] && i != c && this.writingMode[c] == mode && ((f_x1[i] > f_x1[c] && mode != PdfData.VERTICAL_TOP_TO_BOTTOM) || (f_x1[i] < f_x1[c] && mode == PdfData.VERTICAL_TOP_TO_BOTTOM))) { // see // if // on // right gap = (f_x1[i] - f_x2[c]); if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) gap = -gap; // allow for fp error if (gap < 0 && gap > -2) gap = 0; // make sure on right yMidPt = (f_y1[i] + f_y2[i]) / 2; // see if line & if only or better fit if (yMidPt < f_y1[c] && yMidPt > f_y2[c] && (smallest_gap < 0 || gap < smallest_gap)) { smallest_gap = gap; id = i; } } } if (id == -1) // exit when no more matches break; // merge in best match if fit found with last or if overlaps by 
less than half a space,otherwise join float t = f_x1[id] - f_x2[last], possSpace = f_x1[id] - f_x2[c]; float av_char1 = (float) 1.5 * ((f_x2[id] - f_x1[id]) / this.textLength[id]); float av_char2 = (float) 1.5 * ((f_x2[last] - f_x1[last]) / this.textLength[last]); if ((mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM)) { possSpace = -possSpace; t = -t; av_char1 = -av_char1; av_char2 = -av_char2; } if (t < av_char1 && t < av_char2) { merge(last, id, isGapASpace(id, last, possSpace, addSpaceXMLTag, mode), true); } else { current_line.addElement(id); last = id; } // flag used and reset variables used this.isUsed[id] = true; id = -1; smallest_gap = 1000000; } // add line to list this.lines.addElement(current_line); this.max_rows++; } } } /** * * calls various low level merging routines on merge - * * isCSV sets if output is XHTML or CSV format - * * XHTML also has options to include font tags (keepFontInfo), preserve widths (keepWidthInfo), try to preserve alignment (keepAlignmentInfo), and * set a table border width (borderWidth) - AddCustomTags should always be set to false * * @param x1 * is the x coord of the top left corner * @param y1 * is the y coord of the top left corner * @param x2 * is the x coord of the bottom right corner * @param y2 * is the y coord of the bottom right corner * @param pageNumber * is the page you wish to extract from * @param isCSV * is a boolean. If false the output is xhtml if true the text is out as CSV * @param keepFontInfo * if true and isCSV is false keeps font information in extrated text. * @param keepWidthInfo * if true and isCSV is false keeps width information in extrated text. * @param keepAlignmentInfo * if true and isCSV is false keeps alignment information in extrated text. 
* @param borderWidth * is the width of the border for xhtml * @return Map containing text found in estimated table cells * @throws PdfException * If the co-ordinates are not valid */ public final Map extractTextAsTable(int x1, int y1, int x2, int y2, int pageNumber, boolean isCSV, boolean keepFontInfo, boolean keepWidthInfo, boolean keepAlignmentInfo, int borderWidth) throws PdfException { // check in correct order and throw exception if not int[] v = validateCoordinates(x1, y1, x2, y2); x1 = v[0]; y1 = v[1]; x2 = v[2]; y2 = v[3]; /** return the content as an Element */ Map table_content = new HashMap(); LogWriter.writeLog("extracting Text As Table"); // flag type of table so we can add correct separators if (isCSV == true) { this.isXHTML = false; } else { this.isXHTML = true; } // init table variables this.lines = new Vector_Object(20); this.lineY2 = new Vector_Int(20); this.max_rows = 0; // init store for data copyToArrays(x1, y2, x2, y1, keepFontInfo, false, true, null, false); // initial grouping and delete any hidden text removeEncoding(); // eliminate shadows and also merge overlapping text cleanupShadowsAndDrownedObjects(false); int[] items = this.getsortedUnusedFragments(true, false); int item_count = items.length; // number of items if (item_count == 0) return table_content; /** * check orientation and get preferred. 
Items not correct will be ignored */ int writingMode = getWritingMode(items, item_count); String message = "Table Merging algorithm being applied " + (item_count) + " items"; LogWriter.writeLog(message); /** * scan all items joining best fit to right of each fragment to build lines */ if (item_count > 1) { // workout the raw lines createLinesInTable(item_count, items, this.isXHTML, writingMode); /** * generate lookup with lines in correct order (minus used to get correct order down the page) */ int dx = 1; if (writingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || writingMode == PdfData.VERTICAL_TOP_TO_BOTTOM) dx = -1; this.line_order = new int[this.max_rows]; int[] line_y = new int[this.max_rows]; for (int i = 0; i < this.max_rows; i++) { line_y[i] = dx * this.lineY2.elementAt(i); this.line_order[i] = i; } this.line_order = Sorts.quicksort(line_y, this.line_order); // assemble the rows and columns createTableRows(keepAlignmentInfo, keepWidthInfo, writingMode); // assemble the rows and columns mergeTableRows(borderWidth); } this.content[this.master] = cleanup(this.content[this.master]); String processed_value = this.content[this.master].toString(); if (processed_value != null) { // cleanup data if needed by removing duplicate font tokens if (!isCSV) processed_value = Fonts.cleanupTokens(processed_value); table_content.put("content", processed_value); table_content.put("x1", String.valueOf(x1)); table_content.put("x2", String.valueOf(x2)); table_content.put("y1", String.valueOf(y1)); table_content.put("y2", String.valueOf(y2)); } return table_content; } /** make sure co-ords valid and throw exception if not */ private static int[] validateCoordinates(int x1, int y1, int x2, int y2) { if ((x1 > x2) | (y1 < y2)) { // String errorMessage = "Invalid parameters for text rectangle. "; if (x1 > x2) { // errorMessage = // errorMessage // + "x1 value (" // + x1 // + ") must be LESS than x2 (" // + x2 // + "). 
"; int temp = x1; x1 = x2; x2 = temp; LogWriter.writeLog("x1 > x2, coordinates were swapped to validate"); } if (y1 < y2) { // errorMessage = // errorMessage // + "y1 value (" // + y1 // + ") must be MORE than y2 (" // + y2 // + "). "; int temp = y1; y1 = y2; y2 = temp; LogWriter.writeLog("y1 < y2, coordinates were swapped to validate"); } // throw new PdfException(errorMessage); } return new int[] { x1, y1, x2, y2 }; } /** * * algorithm to place data from within coordinates to a vector of word, word coords (x1,y1,x2,y2) * * @param x1 * is the x coord of the top left corner * @param y1 * is the y coord of the top left corner * @param x2 * is the x coord of the bottom right corner * @param y2 * is the y coord of the bottom right corner * @param page_number * is the page you wish to extract from * @param breakFragments * will divide up text based on white space characters * @param punctuation * is a string containing all values that should be used to divide up words * @return Vector containing words found and words coordinates (word, x1,y1,x2,y2...) * @throws PdfException * If the co-ordinates are not valid */ final public List extractTextAsWordlist(int x1, int y1, int x2, int y2, int page_number, boolean breakFragments, String punctuation) throws PdfException { /** make sure co-ords valid and throw exception if not */ int[] v = validateCoordinates(x1, y1, x2, y2); x1 = v[0]; y1 = v[1]; x2 = v[2]; y2 = v[3]; /** extract the raw fragments (Note order or parameters passed) */ if (breakFragments) copyToArrays(x1, y2, x2, y1, true, true, false, punctuation, true); else copyToArrays(); /** delete any hidden text */ removeEncoding(); // eliminate shadows and also merge overlapping text cleanupShadowsAndDrownedObjects(true); int[] items = getsortedUnusedFragments(true, false); int count = items.length; /** * if no values return null */ if (count == 0) { LogWriter.writeLog("Less than 1 text item on page"); return null; } /** * check orientation and get preferred. 
Items not correct will be ignored */ int writingMode = getWritingMode(items, count); /** * build set of lines from text */ createLines(count, items, writingMode, true, false, false); /** * alter co-ords to rotated if requested */ float[] f_x1 = null, f_x2 = null, f_y1 = null, f_y2 = null; if (useUnrotatedCoords || writingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) { f_x1 = this.f_x1; f_x2 = this.f_x2; f_y1 = this.f_y1; f_y2 = this.f_y2; } else if (writingMode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) { f_x2 = this.f_x1; f_x1 = this.f_x2; f_y1 = this.f_y1; f_y2 = this.f_y2; } else if (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) { f_x1 = this.f_y2; f_x2 = this.f_y1; f_y1 = this.f_x2; f_y2 = this.f_x1; } else if (writingMode == PdfData.VERTICAL_TOP_TO_BOTTOM) { f_x1 = this.f_y1; f_x2 = this.f_y2; f_y2 = this.f_x1; f_y1 = this.f_x2; } /** put into a Vector */ List values = new ArrayList(); for (int i = 0; i < this.content.length; i++) { if (this.content[i] != null) { // System.out.println(">>>>>"+content[i]); if ((this.colorExtracted) && (this.isXMLExtraction)) { if (!this.content[i].toString().toLowerCase().startsWith(GenericColorSpace.cb)) { this.content[i].insert(0, this.f_colorTag[this.master]); } if (!this.content[i].toString().toLowerCase().endsWith(GenericColorSpace.ce)) { this.content[i].append(GenericColorSpace.ce); } } if (this.isXMLExtraction) values.add((this.content[i]).toString()); else values.add(Strip.convertToText((this.content[i]).toString(), this.isXMLExtraction)); if ((!useUnrotatedCoords) && (writingMode == PdfData.VERTICAL_TOP_TO_BOTTOM)) { values.add(String.valueOf(f_x1[i])); values.add(String.valueOf(f_y1[i])); values.add(String.valueOf(f_x2[i])); values.add(String.valueOf(f_y2[i])); } else if ((!useUnrotatedCoords) && (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP)) { values.add(String.valueOf(f_x1[i])); values.add(String.valueOf(f_y2[i])); values.add(String.valueOf(f_x2[i])); values.add(String.valueOf(f_y1[i])); } else { 
values.add(String.valueOf(f_x1[i])); values.add(String.valueOf(f_y1[i])); values.add(String.valueOf(f_x2[i])); values.add(String.valueOf(f_y2[i])); } } } LogWriter.writeLog("Text extraction as wordlist completed"); return values; } /** * reset global values */ private void reset() { this.isXHTML = true; this.nextSlot = 0; this.lineBreaks = new Vector_Int(); this.max_rows = 0; this.master = 0; this.colorExtracted = false; } /** * algorithm to place data from specified coordinates on a page into a String. * * @param x1 * is the x coord of the top left corner * @param y1 * is the y coord of the top left corner * @param x2 * is the x coord of the bottom right corner * @param y2 * is the y coord of the bottom right corner * @param page_number * is the page you wish to extract from * @param estimateParagraphs * will attempt to find paragraphs and add new lines in output if true * @param breakFragments * will divide up text based on white space characters if true * @return Vector containing words found and words coordinates (word, x1,y1,x2,y2...) * @throws PdfException * If the co-ordinates are not valid */ final public String extractTextInRectangle(int x1, int y1, int x2, int y2, int page_number, boolean estimateParagraphs, boolean breakFragments) throws PdfException { reset(); if ((breakFragments) && (!this.pdf_data.IsEmbedded())) throw new PdfException( "[PDF] Request to breakfragments and width not added. 
Please add call to init(true) of PdfDecoder to your code."); /** make sure co-ords valid and throw exception if not */ int[] v = validateCoordinates(x1, y1, x2, y2); x1 = v[0]; y1 = v[1]; x2 = v[2]; y2 = v[3]; int master, count; /** extract the raw fragments (Note order or parameters passed) */ if (breakFragments) copyToArrays(x1, y2, x2, y1, (this.isXMLExtraction), false, false, null, false); else copyToArrays(); /** * delete any hidden text */ removeEncoding(); /** * eliminate shadows and also merge overlapping text */ cleanupShadowsAndDrownedObjects(false); /** get the fragments as an array */ int[] items = getsortedUnusedFragments(true, false); count = items.length; /** * if no values return null */ if (count == 0) { LogWriter.writeLog("Less than 1 text item on page"); return null; } /** * check orientation and get preferred. Items not correct will be ignored */ int writingMode = getWritingMode(items, count); /** * build set of lines from text */ createLines(count, items, writingMode, false, this.isXMLExtraction, false); /** * roll lines together */ master = mergeLinesTogether(writingMode, estimateParagraphs, x1, x2, y1, y2); /** * add final deliminators */ if (this.isXMLExtraction) { this.content[master] = new StringBuilder(Fonts.cleanupTokens(this.content[master].toString())); this.content[master].insert(0, "

"); this.content[master].append("

"); } LogWriter.writeLog("Text extraction completed"); return cleanup(this.content[master]).toString(); } private StringBuilder cleanup(StringBuilder buffer) { if (buffer == null) return buffer; /** if(PdfDecoder.inDemo){ int icount=buffer.length(),count=0; boolean inToken=false; for(int i=0;i') inToken=false; else if((c!=' ')&&(!inToken)){ count++; if(count>4){ count=0; buffer.setCharAt(i,'1'); } } } } /**/ // sort out & to & if (this.isXMLExtraction) { String buf = buffer.toString(); buf = buf.replaceAll("&#", "XX#"); buf = buf.replaceAll("<", "XXlt"); buf = buf.replaceAll(">", "XXgt"); buf = buf.replaceAll("&", "&"); // put back others buf = buf.replaceAll("XX#", "&#"); buf = buf.replaceAll("XXlt", "<"); buf = buf.replaceAll("XXgt", ">"); boolean removeInvalidXMLValues = true; if (removeInvalidXMLValues) { /** * Restricted Char ::= [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F] [#x1-#x8] | [#x11-#x12] | [#x14-#x31] | * [#x127-#x132] | [#x134-#x159] */ /** set mappings */ Map asciiMappings = new HashMap(); /** [#x1-#x8] */ for (int i = 1; i <= 8; i++) asciiMappings.put("&#" + i + ';', ""); /** [#x11-#x12] */ for (int i = 11; i <= 12; i++) asciiMappings.put("&#" + i + ';', ""); /** [#x14-#x31] */ for (int i = 14; i <= 31; i++) asciiMappings.put("&#" + i + ';', ""); /** [#x127-#x132] */ // for (int i = 127; i <= 132; i++) // asciiMappings.put("&#" + i + ";", ""); /** [#x134-#x159] */ // for (int i = 134; i <= 159; i++) // asciiMappings.put("&#" + i + ";", ""); /** substitute illegal XML characters for mapped values */ for (Object o : asciiMappings.keySet()) { String character = (String) o; String mappedCharacter = (String) asciiMappings.get(character); buf = buf.replace(character, mappedCharacter); } } buffer = new StringBuilder(buf); } return buffer; } /** * scan fragments and detect orientation. 
If multiple, prefer horizontal */ private int getWritingMode(int[] items, int count) { /** * get first value */ int orientation = this.writingMode[items[0]]; // exit if first is horizontal if (orientation == PdfData.HORIZONTAL_LEFT_TO_RIGHT || orientation == PdfData.HORIZONTAL_RIGHT_TO_LEFT) return orientation; /** * scan items looking at orientation - exit if we find horizontal */ for (int j = 1; j < count; j++) { int c = items[j]; if (!this.isUsed[c]) { if (this.writingMode[c] == PdfData.HORIZONTAL_LEFT_TO_RIGHT || this.writingMode[c] == PdfData.HORIZONTAL_RIGHT_TO_LEFT) { orientation = this.writingMode[c]; j = count; LogWriter.writeLog("Text of multiple orientations found. Only horizontal text used."); } } } return orientation; } /** * @param estimateParagraphs * @throws PdfException */ private int mergeLinesTogether(int currentWritingMode, boolean estimateParagraphs, int x1, int x2, int y1, int y2) throws PdfException { String separator; int[] indices; // used for working out alignment int middlePage; /** * create local copies of */ float[] f_x1, f_x2, f_y1, f_y2; if (currentWritingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) { f_x1 = this.f_x1; f_x2 = this.f_x2; f_y1 = this.f_y1; f_y2 = this.f_y2; indices = getsortedUnusedFragments(false, true); middlePage = (x1 + x2) / 2; } else if (currentWritingMode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) { f_x2 = this.f_x1; f_x1 = this.f_x2; f_y1 = this.f_y1; f_y2 = this.f_y2; indices = getsortedUnusedFragments(false, true); middlePage = (x1 + x2) / 2; } else if (currentWritingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) { f_x1 = this.f_y1; f_x2 = this.f_y2; f_y1 = this.f_x2; f_y2 = this.f_x1; indices = getsortedUnusedFragments(true, true); indices = reverse(indices); middlePage = (y1 + y2) / 2; } else if (currentWritingMode == PdfData.VERTICAL_TOP_TO_BOTTOM) { f_x1 = this.f_y2; f_x2 = this.f_y1; f_y2 = this.f_x2; f_y1 = this.f_x1; indices = getsortedUnusedFragments(true, true); middlePage = (y1 + y2) / 2; } else { throw new 
PdfException("Illegal value " + currentWritingMode + "for currentWritingMode"); } int quarter = middlePage / 2; int count = indices.length; int master = indices[count - 1]; /** * now loop through all lines merging */ int ClastChar, MlastChar, CFirstChar; final boolean debug = false; for (int i = count - 2; i > -1; i--) { int child = indices[i]; separator = ""; /** add formatting in to retain structure */ // text to see if lasts ends with . and next starts with capital // -1 if no chars ClastChar = getLastChar(this.content[child]); if (debug) { CFirstChar = getFirstChar(this.content[child]); MlastChar = getLastChar(this.content[master]); StringBuilder child_textX = Strip.stripXML(this.content[child], this.isXMLExtraction); String master_textX = Strip.stripXML(this.content[master], this.isXMLExtraction).toString(); } if (ClastChar != -1) { addAlignmentFormatting(estimateParagraphs, middlePage, f_x1, f_x2, quarter, child); // see if we insert a line break and merge String lineSpace = "

" + SystemSeparator + "

"; if (this.isXMLExtraction) lineSpace = SystemSeparator; float gap = f_y2[master] - f_y1[child]; float line_height = f_y1[child] - f_y2[child]; if (currentWritingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) { gap = -gap; line_height = -line_height; } if ((gap > line_height) & (line_height > 0)) { // add in line gaps while (gap > line_height) { separator = separator + lineSpace; gap = gap - line_height; } if (this.isXMLExtraction) separator = separator + "

" + SystemSeparator + "

"; else separator = SystemSeparator; } else if (estimateParagraphs == true) { CFirstChar = getFirstChar(this.content[child]); MlastChar = getLastChar(this.content[master]); if ((((MlastChar == '.')) || (((MlastChar == '\"')))) && ((CFirstChar >= 'A') && (CFirstChar <= 'Z'))) { if (this.isXMLExtraction) separator = "

" + SystemSeparator + "

"; else separator = SystemSeparator; } } else { if (this.isXMLExtraction) { this.content[child].insert(0, "

" + SystemSeparator + "

"); } else this.content[master].append(SystemSeparator); } merge(master, child, separator, false); } } return master; }

    /**
     * Finds the first visible character in a fragment, skipping spaces and markup.
     *
     * Anything between '<' and '>' is skipped. When extracting XML, an '&' followed by '#', 'g' or 'l'
     * is also treated as the start of a token that runs to the next ';'. A token ending in "lt;" or
     * "gt;" is reported as '<' or '>'. Any other '&' is reported as a literal '&'.
     *
     * @param buffer
     *            fragment text to scan
     * @return the character as an int, or -1 if no visible character is found
     */
    private int getFirstChar(StringBuilder buffer) {

        int i = -1;
        boolean inTag = false;
        int count = buffer.length();
        // character that opened the token currently being skipped ('<' or '&')
        char openChar = ' ';
        int ptr = 0;

        // setting ptr = count anywhere below is how the scan is ended once a result is known
        while (ptr < count) {
            char nextChar = buffer.charAt(ptr);

            if ((!inTag) && ((nextChar == '<') || (this.isXMLExtraction && nextChar == '&'))) {
                inTag = true;
                openChar = nextChar;

                // trap & .... &xx; or other spurious
                if ((openChar == '&')) {
                    if ((ptr + 1) == count) {
                        // '&' is the very last character so it cannot start an entity
                        i = '&';
                        ptr = count;
                    } else {
                        char c = buffer.charAt(ptr + 1);
                        // only &#.., &g.. and &l.. are accepted as possible entities
                        if ((c != '#') && (c != 'g') && (c != 'l')) {
                            i = '&';
                            ptr = count;
                        }
                    }
                }
            }

            // first non-space character outside any token is the answer
            if ((!inTag) && (nextChar != ' ')) {
                i = nextChar;
                ptr = count;
            }

            // allow for valid & in stream
            if ((inTag) && (openChar == '&') && (nextChar == ' ')) {
                // a space before any ';' means the '&' was ordinary text
                i = openChar;
                ptr = count;
            } else if ((inTag) && ((nextChar == '>') || (this.isXMLExtraction && openChar == '&' && nextChar == ';'))) {

                // put back < or >
                // note '&' (not '&&') before the charAt test, so both sides are always evaluated
                if ((nextChar == ';') && (openChar == '&') && (ptr > 2) & (buffer.charAt(ptr - 1) == 't')) {
                    if ((buffer.charAt(ptr - 2) == 'l')) {
                        i = '<';
                        ptr = count;
                    } else if ((buffer.charAt(ptr - 2) == 'g')) {
                        i = '>';
                        ptr = count;
                    }
                }
                // token closed, resume looking for visible text
                inTag = false;
            }
            ptr++;
        }
        return i;
    }

/** return char as int or -1 if no match */ private int getLastChar(StringBuilder buffer) { int i = -1; boolean inTag = false; int count = buffer.length(); int size = count; char openChar = ' '; count--; // knock 1 off so points to last char while (count > -1) { char nextChar = buffer.charAt(count); // trap &xx;; if (inTag && openChar == ';' && nextChar == ';') { i = ';'; count = -1; } if (!inTag && (nextChar == '>' || (this.isXMLExtraction && nextChar == ';'))) { inTag = true; // check it is a token and not just > at end int lastTokenStart = buffer.lastIndexOf("') { inTag = false; ptr = count; } } } if (inTag) openChar = nextChar; else { i = nextChar; count = -1; } } if (!inTag && nextChar != 32) { i = nextChar; count = -1; } if (nextChar == 
'<' || (this.isXMLExtraction && openChar == ';' && nextChar == '&')) { inTag = false; // put back < or > if ((nextChar == '&') && (count + 3 < size) & (buffer.charAt(count + 2) == 't') && (buffer.charAt(count + 3) == ';')) { if ((buffer.charAt(count + 1) == 'l')) { i = '<'; count = -1; } else if ((buffer.charAt(count + 1) == 'g')) { i = '>'; count = -1; } } } if (inTag && openChar == ';' && nextChar == ' ') { count = -1; i = ';'; } count--; } return i; } /** * reverse order in matrix so back to front */ private static int[] reverse(int[] indices) { int count = indices.length; int[] newIndex = new int[count]; for (int i = 0; i < count; i++) { newIndex[i] = indices[count - i - 1]; } return newIndex; } /** * used to add LEFT,CENTER,RIGHT tags into XML when extracting text */ private void addAlignmentFormatting(boolean estimateParagraphs, int middlePage, float[] f_x1, float[] f_x2, int quarter, int child) { // put in some alignment float left_gap = middlePage - f_x1[child]; float right_gap = f_x2[child] - middlePage; if ((!estimateParagraphs) && (this.isXMLExtraction) && (left_gap > 0) && (right_gap > 0) && (f_x1[child] > quarter) && (f_x1[child] < (middlePage + quarter))) { float ratio = left_gap / right_gap; if (ratio > 1) ratio = 1 / ratio; if (ratio > 0.95) { // add centring if seems centered around middle this.content[child] = new StringBuilder(Fonts.cleanupTokens(this.content[child].toString())); this.content[child].insert(0, "

"); this.content[child].append("
\n"); } else if ((right_gap < 10) & (left_gap > 30)) { // add right align this.content[child] = new StringBuilder(Fonts.cleanupTokens(this.content[child].toString())); this.content[child].insert(0, ""); this.content[child].append("\n"); } } } /** * convert fragments into lines of text */ /** * convert fragments into lines of text */ private void createLines(int count, int[] items, int mode, boolean breakOnSpace, boolean addMultiplespaceXMLTag, boolean sameLineOnly) throws PdfException { String separator; final boolean debug = false; /** * create local copies of arrays */ float[] f_x1, f_x2, f_y1, f_y2; /** * reverse order if text right to left */ if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) items = reverse(items); /** * set pointers so left to right text */ if (mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) { f_x1 = this.f_x1; f_x2 = this.f_x2; f_y1 = this.f_y1; f_y2 = this.f_y2; } else if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) { f_x2 = this.f_x1; f_x1 = this.f_x2; f_y1 = this.f_y1; f_y2 = this.f_y2; } else if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) { f_x1 = this.f_y1; f_x2 = this.f_y2; f_y1 = this.f_x2; f_y2 = this.f_x1; } else if (mode == PdfData.VERTICAL_TOP_TO_BOTTOM) { f_x1 = this.f_y2; f_x2 = this.f_y1; f_y2 = this.f_x1; f_y1 = this.f_x2; } else { throw new PdfException("Illegal value " + mode + "for currentWritingMode"); } /** * scan items joining best fit to right of each fragment to build lines. 
This is tedious and processor intensive but necessary as the order * cannot be guaranteed */ for (int j = 0; j < count; j++) { int id = -1, i; int c = items[j]; float smallest_gap = -1, gap, yMidPt; if (!this.isUsed[c] && this.writingMode[c] == mode) { if (debug) System.out.println("Look for match with " + removeHiddenMarkers(this.content[c].toString())); while (true) { for (int j2 = 0; j2 < count; j2++) { i = items[j2]; if (this.isUsed[i] == false) { // amount of variation in bottom of text int baseLineDifference = (int) (f_y2[i] - f_y2[c]); if (baseLineDifference < 0) baseLineDifference = -baseLineDifference; // amount of variation in bottom of text int topLineDifference = (int) (f_y1[i] - f_y1[c]); if (topLineDifference < 0) topLineDifference = -topLineDifference; // line gap int lineGap = (int) (f_x1[i] - f_x2[c]); // Check if fragments are closer from the other end if (lineGap > (int) (f_x1[c] - f_x2[i])) lineGap = (int) (f_x1[c] - f_x2[i]); int fontSizeChange = this.fontSize[c] - this.fontSize[i]; if (fontSizeChange < 0) fontSizeChange = -fontSizeChange; if (debug) System.out.println("Against " + removeHiddenMarkers(this.content[i].toString())); if (sameLineOnly && lineGap > this.fontSize[c] && lineGap > 0) { // ignore text in wrong order allowing slight margin for // error // allow for multicolumns with gap if (debug) System.out.println("case1 lineGap=" + lineGap); // //Case removed as it broke one file and had no effect on other files // }else if (sameLineOnly && (lineGap > (fontSize[c]*10)|| lineGap > (fontSize[i]*10)) ) { //JUMP IN TEXT SIZE ACROSS // COL // //ignore // // if(debug) // System.out.println("case2"); } else if (sameLineOnly && baseLineDifference > 1 && lineGap > 2 * this.fontSize[c] && (this.fontSize[c] == this.fontSize[i])) { // TEXT SLIGHTLY OFFSET // ignore if (debug) System.out.println("case3"); } else if (sameLineOnly && baseLineDifference > 3) { // ignore if (debug) System.out.println("case4"); } else if (sameLineOnly && fontSizeChange 
> 2) { // ignore if (debug) System.out.println("case5"); } else if (i != c && ((f_x1[i] > f_x1[c] && mode != PdfData.VERTICAL_TOP_TO_BOTTOM) || f_x1[i] < f_x1[c] && mode == PdfData.VERTICAL_TOP_TO_BOTTOM && this.writingMode[c] == mode && (!(fontSizeChange > 2) || (fontSizeChange > 2 && topLineDifference < 3)))) { // see if // on // right gap = (f_x1[i] - f_x2[c]); if (debug) System.out.println("case6 gap=" + gap); if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) gap = -gap; // allow for fp error if ((gap < 0) && (gap > -2)) gap = 0; // make sure on right yMidPt = (f_y1[i] + f_y2[i]) / 2; // see if line & if only or better fit if ((yMidPt < f_y1[c]) && (yMidPt > f_y2[c]) && ((smallest_gap < 0) || (gap < smallest_gap))) { smallest_gap = gap; id = i; } } } } // merge on next right item or exit when no more matches if (id == -1) break; float possSpace = f_x1[id] - f_x2[c]; if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) possSpace = -possSpace; else if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) possSpace = (f_x2[id] - f_x1[c]); // add space if gap between this and last object separator = isGapASpace(c, id, possSpace, addMultiplespaceXMLTag, mode); /** merge if adjoin */ if ((breakOnSpace) && (this.hadSpace != null) && ((this.hadSpace[c]) || (separator.startsWith(" ")))) break; merge(c, id, separator, true); id = -1; // reset smallest_gap = 1000000; // and reset the gap } } } } static class ResultsComparator implements Comparator { private int rotation; public ResultsComparator(int rotation) { this.rotation = rotation; } @Override public int compare(Object o1, Object o2) { Rectangle[] ra1; Rectangle[] ra2; if (o1 instanceof Rectangle[]) { ra1 = (Rectangle[]) o1; } else ra1 = new Rectangle[] { (Rectangle) o1 }; if (o2 instanceof Rectangle[]) { ra2 = (Rectangle[]) o2; } else ra2 = new Rectangle[] { (Rectangle) o2 }; for (int i = 0; i != ra1.length; i++) for (int j = 0; j != ra2.length; j++) { // 
do we need this loop? Rectangle r1 = ra1[i]; Rectangle r2 = ra2[j]; switch (this.rotation) { case 0: if (r1.y == r2.y) { // the two words on on the same level so pick the one on the left if (r1.x > r2.x) return 1; else return -1; } else if (r1.y > r2.y) { // the first word is above the second, so pick the first return -1; } return 1;// the second word is above the first, so pick the second case 90: if (r1.x == r2.x) { // the two words on on the same level so pick the one on the left if (r1.y > r2.y) return 1; else return -1; } else if (r1.x > r2.x) // the first word is above the second, so pick the first return 1; return -1; // the second word is above the first, so pick the second case 180: if (r1.y == r2.y) { // the two words on on the same level so pick the one on the left if (r1.x > r2.x) return 1; else return -1; } else if (r1.y > r2.y) { // the first word is above the second, so pick the first return -1; } return 1;// the second word is above the first, so pick the second case 270: if (r1.x == r2.x) { // the two words on on the same level so pick the one on the left if (r1.y > r2.y) return 1; else return -1; } else if (r1.x < r2.x) // the first word is above the second, so pick the first return 1; return -1; // the second word is above the first, so pick the second } // Orginal code kept incase of mistake. 
// if (rotation == 0 || rotation == 180) { // if (r1.y == r2.y) { // the two words on on the same level so pick the one on the left // if (r1.x > r2.x) // return 1; // else // return -1; // } else if (r1.y > r2.y) { // the first word is above the second, so pick the first // return -1; // } // // return 1; // the second word is above the first, so pick the second // } // else { // rotation == 90 or 270 // if (r1.x == r2.x) { // the two words on on the same level so pick the one on the left // if (r1.y > r2.y) // return 1; // else // return -1; // } else if (r1.x > r2.x) // the first word is above the second, so pick the first // return 1; // // return -1; // the second word is above the first, so pick the second // } } return -1; // the second word is above the first, so pick the second } } // /** * Algorithm to find multiple text terms in x1,y1,x2,y2 rectangle on page_number, with matching teaser * * @param x1 * the left x cord * @param y1 * the upper y cord * @param x2 * the right x cord * @param y2 * the lower y cord * @param rotation * the rotation of the page to be searched * @param page_number * the page number to search on * @param terms * the terms to search for * @param searchType * searchType the search type made up from one or more constants obtained from the SearchType class * @param listener * an implementation of SearchListener is required, this is to enable searching to be cancelled * @return a SortedMap containing a collection of Rectangle describing the location of found text, mapped to a String which is the matching teaser * @throws PdfException * If the co-ordinates are not valid */ public SortedMap findMultipleTermsInRectangleWithMatchingTeasers(int x1, int y1, int x2, int y2, final int rotation, int page_number, String[] terms, int searchType, SearchListener listener) throws PdfException { this.usingMultipleTerms = true; this.multipleTermTeasers.clear(); this.teasers = null; boolean origIncludeTease = this.includeTease; this.includeTease = 
true;

        // The search flags set above are instance state shared with findText().
        // Restore them in finally so a PdfException from the search cannot leave
        // later searches running in multiple-term / teaser mode.
        try {
            List highlights = findMultipleTermsInRectangle(x1, y1, x2, y2, page_number, terms, searchType, listener);

            // pair each highlight with the teaser stored at the same index, sorted for this rotation
            SortedMap highlightsWithTeasers = new TreeMap(new ResultsComparator(rotation));
            for (int i = 0; i < highlights.size(); i++) {
                /* highlights.get(i) is a rectangle or a rectangle[] */
                highlightsWithTeasers.put(highlights.get(i), this.multipleTermTeasers.get(i));
            }

            return highlightsWithTeasers;
        } finally {
            this.usingMultipleTerms = false;
            this.includeTease = origIncludeTease;
        }
    }

// /** * Algorithm to find multiple text terms in x1,y1,x2,y2 rectangle on page_number. * * @param x1 * the left x cord * @param y1 * the upper y cord * @param x2 * the right x cord * @param y2 * the lower y cord * @param rotation * the rotation of the page to be searched * @param page_number * the page number to search on * @param terms * the terms to search for * @param orderResults * if true the list that is returned is ordered to return the resulting rectangles in a logical order descending down the page, if * false, rectangles for multiple terms are grouped together. 
* @param searchType * searchType the search type made up from one or more constants obtained from the SearchType class * @param listener * an implementation of SearchListener is required, this is to enable searching to be cancelled * @return a list of Rectangle describing the location of found text * @throws PdfException * If the co-ordinates are not valid */ public List findMultipleTermsInRectangle(int x1, int y1, int x2, int y2, final int rotation, int page_number, String[] terms, boolean orderResults, int searchType, SearchListener listener) throws PdfException { this.usingMultipleTerms = true; this.multipleTermTeasers.clear(); this.teasers = null; List highlights = findMultipleTermsInRectangle(x1, y1, x2, y2, page_number, terms, searchType, listener); if (orderResults) { Collections.sort(highlights, new ResultsComparator(rotation)); } this.usingMultipleTerms = false; return highlights; } private List findMultipleTermsInRectangle(int x1, int y1, int x2, int y2, int page_number, String[] terms, int searchType, SearchListener listener) throws PdfException { List list = new ArrayList(); for (String term : terms) { if (listener != null && listener.isCanceled()) { // System.out.println("RETURNING EARLY"); break; } float[] co_ords; co_ords = findText(new Rectangle(x1, y1, x2, y2), page_number, new String[] { term }, searchType); if (co_ords != null) { int count = co_ords.length; for (int ii = 0; ii < count; ii = ii + 5) { int wx1 = (int) co_ords[ii]; int wy1 = (int) co_ords[ii + 1]; int wx2 = (int) co_ords[ii + 2]; int wy2 = (int) co_ords[ii + 3]; Rectangle rectangle = new Rectangle(wx1, wy2, wx2 - wx1, wy1 - wy2); int seperator = (int) co_ords[ii + 4]; if (seperator == this.linkedSearchAreas) { Vector_Rectangle vr = new Vector_Rectangle(); vr.addElement(rectangle); while (seperator == this.linkedSearchAreas) { ii = ii + 5; wx1 = (int) co_ords[ii]; wy1 = (int) co_ords[ii + 1]; wx2 = (int) co_ords[ii + 2]; wy2 = (int) co_ords[ii + 3]; seperator = (int) co_ords[ii + 4]; 
rectangle = new Rectangle(wx1, wy2, wx2 - wx1, wy1 - wy2); vr.addElement(rectangle); } vr.trim(); list.add(vr.get()); } else { list.add(rectangle); } } } } return list; } // /** * Method to find text in the specified area allowing for the text to be split across multiple lines.
* * @param searchArea * = Area on page to search. If null search whole page * @param page_number * = the current page to search * @param terms * = the text to search for * @param searchType * = info on how to search the pdf * @return the coords of the found text in a float[] where the coords are pdf page coords. The origin of the coords is the bottom left hand corner * (on unrotated page) organised in the following order.
[0]=result x1 coord
[1]=result y1 coord
[2]=result x2 coord
* [3]=result y2 coord
[4]=either -101 to show that the next text area is the remainder of this word on another line else any other * value is ignored.
* @throws PdfException */ final public float[] findText(Rectangle searchArea, int page_number, String[] terms, int searchType) throws PdfException { // Failed to supply search terms to do nothing if (terms == null) return new float[] {}; // Flags to control the different search options boolean firstOccuranceOnly = false; boolean wholeWordsOnly = false; boolean foundFirst = false; boolean useRegEx = false; // Search result and teaser holders Vector_Float resultCoords = new Vector_Float(0); Vector_String resultTeasers = new Vector_String(0); // Extract the text data into local arrays for searching copyToArrays(); // Remove any hidden text on page as should not be found cleanupShadowsAndDrownedObjects(false); // Get unused text objects and sort them for correct searching int[] items = getsortedUnusedFragments(true, false); /** * check orientation and get preferred. Items not correct will be ignored */ int l2r = 0; int r2l = 0; int t2b = 0; int b2t = 0; for (int i = 0; i != items.length; i++) { switch (this.writingMode[items[i]]) { case 0: l2r++; break; case 1: r2l++; break; case 2: t2b++; break; case 3: b2t++; break; } } int[] unsorted = new int[] { l2r, r2l, t2b, b2t }; int[] sorted = new int[] { l2r, r2l, t2b, b2t }; // Set all to -1 so we can tell if it's been set yet int[] writingModes = new int[] { -1, -1, -1, -1 }; Arrays.sort(sorted); for (int i = 0; i != unsorted.length; i++) { for (int j = 0; j < sorted.length; j++) { if (unsorted[i] == sorted[j]) { int pos = j - 3; if (pos < 0) pos = -pos; if (writingModes[pos] == -1) { writingModes[pos] = i; j = sorted.length; } } } } for (int u = 0; u != writingModes.length; u++) { int writingMode = writingModes[u]; // if not lines for writing mode, ignore if (unsorted[writingMode] != 0) { // Merge text fragments into lines as displayed on page createLines(items.length, items, writingMode, true, false, true); // Bitwise flags for regular expressions engine, options always required int options = 0; // Turn on case sensitive 
mode if ((searchType & SearchType.CASE_SENSITIVE) != SearchType.CASE_SENSITIVE) { options = (options | Pattern.CASE_INSENSITIVE); } // Only find first occurance of each search term if ((searchType & SearchType.FIND_FIRST_OCCURANCE_ONLY) == SearchType.FIND_FIRST_OCCURANCE_ONLY) { firstOccuranceOnly = true; } // Only find whole words, not partial words if ((searchType & SearchType.WHOLE_WORDS_ONLY) == SearchType.WHOLE_WORDS_ONLY) { wholeWordsOnly = true; } // Allow search to find split line results if ((searchType & SearchType.MUTLI_LINE_RESULTS) == SearchType.MUTLI_LINE_RESULTS) { options = (options | Pattern.MULTILINE | Pattern.DOTALL); } // Allow the use of regular expressions symbols if ((searchType & SearchType.USE_REGULAR_EXPRESSIONS) == SearchType.USE_REGULAR_EXPRESSIONS) { useRegEx = true; } /** * create local copies of arrays */ float[] f_y1 = this.f_y1, f_y2 = this.f_y2; /** * swap around x and y so rountine works on all cases */ boolean valuesSwapped = false; if (writingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) { f_y1 = this.f_y1; f_y2 = this.f_y2; } else if (writingMode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) { f_y1 = this.f_y1; f_y2 = this.f_y2; } else if (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) { f_y1 = this.f_x2; f_y2 = this.f_x1; valuesSwapped = true; } else if (writingMode == PdfData.VERTICAL_TOP_TO_BOTTOM) { f_y2 = this.f_x1; f_y1 = this.f_x2; valuesSwapped = true; } // Portions of text to perform the search on and find teasers String[] searchText; String[] coordsText; // Merge all text into one with \n line separators // This will allow checking for multi line split results String plain = ""; String raw = ""; for (int i = 0; i != this.content.length; i++) { if (this.content[i] != null && writingMode == this.writingMode[i]) { raw += this.content[i] + "\n"; plain += this.content[i] + "\n"; } } // Remove double spaces, replacing them with single spaces raw = removeDuplicateSpaces(raw); plain = removeDuplicateSpaces(plain); // Strip xml from 
content and keep coords and text data raw = Strip.stripXML(raw, this.isXMLExtraction).toString(); // Strip xml and coords data from content and keep text data plain = removeHiddenMarkers(plain); plain = Strip.stripXML(plain, this.isXMLExtraction).toString(); // Store text in the search and teaser arrays searchText = new String[] { plain }; coordsText = new String[] { raw }; // Hold starting point data at page rotation Point resultStart; // Work through the search terms one at a time for (int j = 0; j != terms.length; j++) { String searchValue = terms[j]; // Set the default separator between words in a search term String sep = " "; // Multiline needs space or newline to be recognised as word separators if ((searchType & SearchType.MUTLI_LINE_RESULTS) == SearchType.MUTLI_LINE_RESULTS) { sep = "[ \\\\n]"; } // if not using reg ex add reg ex literal flags around the text and word separators if (!useRegEx) { searchValue = "\\Q" + searchValue + "\\E"; sep = "\\\\E" + sep + "\\\\Q"; } // If word seperator has changed, replace all spaces with modified seperator if (!sep.equals(" ")) { searchValue = searchValue.replaceAll(" ", sep); } // Surround search term with word boundry tags to match whole words if (wholeWordsOnly) searchValue = "\\b" + searchValue + "\\b"; // Create pattern to match search term Pattern searchTerm = Pattern.compile(searchValue, options); // Create pattern to match search term with two words before and after Pattern teaserTerm = Pattern.compile("(?:\\S+\\s)?\\S*(?:\\S+\\s)?\\S*" + searchValue + "\\S*(?:\\s\\S+)?\\S*(?:\\s\\S+)?", options); // Loop through all search text for (int i = 0; i != searchText.length; i++) { // Get text data and text+coord data String plainText = searchText[i]; String coordText = coordsText[i]; // So long as text data is not null if (plainText != null) { // Create two matchers for finding search term and teaser Matcher termFinder = searchTerm.matcher(plainText); Matcher teaserFinder = teaserTerm.matcher(plainText); boolean 
needToFindTeaser = true; // Keep looping till no result is returned while (termFinder.find()) { resultStart = null; // Make note of the text found and index in the text String foundTerm = termFinder.group(); int termStarts = termFinder.start(); int termEnds = termFinder.end() - 1; // If storing teasers if (this.includeTease) { // Store the term found as a default value String teaser = foundTerm; if (this.includeHTMLtags) teaser = "" + teaser + ""; boolean itemFound = false; if (needToFindTeaser) { itemFound = teaserFinder.find(); } if (itemFound) { // Get a teaser if found and set the search term to bold is allowed if (teaserFinder.start() < termStarts && teaserFinder.end() > termEnds) { // replace default with found teaser teaser = teaserFinder.group(); if (this.includeHTMLtags) { // Calculate points to add bold tags int teaseStarts = termStarts - teaserFinder.start(); int teaseEnds = (termEnds - teaserFinder.start()) + 1; // Add bold tags teaser = teaser.substring(0, teaseStarts) + "" + teaser.substring(teaseStarts, teaseEnds) + "" + teaser.substring(teaseEnds, teaser.length()); } needToFindTeaser = true; } else { needToFindTeaser = false; } } // Store teaser resultTeasers.addElement(teaser); } // Get coords of found text for highlights float currentX; float width; // Track point in text data line (without coord data) int pointInLine = -1; // Track line on page int lineCounter = 0; // Skip null values and value not in the correct writing mode to ensure correct result coords while (this.content[lineCounter] == null || writingMode != this.writingMode[lineCounter]) lineCounter++; // Flags used to catch if result is split accross lines boolean startFound = false; boolean endFound = false; // Cycle through coord text looking for coords of this result // Ignore first value as it is known to be the first marker for (int pointer = 1; pointer < coordText.length(); pointer++) { // find second marker and get x coord int startPointer = pointer; while (pointer < 
coordText.length()) { if (coordText.charAt(pointer) == MARKER2) break; pointer++; } // Convert text to float value for x coord currentX = Float.parseFloat(coordText.substring(startPointer, pointer)); pointer++; // find third marker and get width startPointer = pointer; while (pointer < coordText.length()) { if (coordText.charAt(pointer) == MARKER2) break; pointer++; } // Convert text to float value for character width width = Float.parseFloat(coordText.substring(startPointer, pointer)); pointer++; // find fourth marker and get text (character) startPointer = pointer; while (pointer < coordText.length()) { if (coordText.charAt(pointer) == MARKER2) break; pointer++; } // Store text to check for newline character later String text = coordText.substring(startPointer, pointer); pointInLine += text.length(); // Start of term not found yet. // Point in line is equal to or greater than start of the term. // Store coords and mark start as found. if (!startFound && pointInLine >= termStarts) { resultStart = new Point((int) currentX, (int) f_y1[lineCounter]); startFound = true; } // End of term not found yet. // Point in line is equal to or greater than end of the term. // Store coords and mark end as found. 
if (!endFound && pointInLine >= termEnds) { if (valuesSwapped) { if (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) { resultCoords.addElement((int) f_y2[lineCounter]); resultCoords.addElement((int) currentX + width); resultCoords.addElement(resultStart.y); resultCoords.addElement(resultStart.x); resultCoords.addElement(0.0f); } else { resultCoords.addElement((int) f_y2[lineCounter]); resultCoords.addElement(resultStart.x); resultCoords.addElement(resultStart.y); resultCoords.addElement((int) currentX + width); resultCoords.addElement(0.0f); } } else { resultCoords.addElement(resultStart.x); resultCoords.addElement(resultStart.y); resultCoords.addElement(currentX + width); resultCoords.addElement(f_y2[lineCounter]); resultCoords.addElement(0.0f); } endFound = true; } // Using multi line option. // Start of term found. // End of term not found. // New line character found. // Set up multi line result. if (startFound && !endFound && text.contains("\n")) { // Set ends coords if (valuesSwapped) { if (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) { resultCoords.addElement((int) f_y2[lineCounter]); resultCoords.addElement((int) currentX + width); resultCoords.addElement(resultStart.y); resultCoords.addElement(resultStart.x); resultCoords.addElement(this.linkedSearchAreas); // Mark next result as linked } else { resultCoords.addElement((int) f_y2[lineCounter]); resultCoords.addElement(resultStart.x); resultCoords.addElement(resultStart.y); resultCoords.addElement((int) currentX + width); resultCoords.addElement(this.linkedSearchAreas); // Mark next result as linked } } else { resultCoords.addElement(resultStart.x); resultCoords.addElement(resultStart.y); resultCoords.addElement(currentX + width); resultCoords.addElement(f_y2[lineCounter]); resultCoords.addElement(this.linkedSearchAreas); // Mark next result as linked } // Set start of term as not found startFound = false; // Set this point in line as start of next term // Guarantees next character is found as // start of 
the next part of the search term termStarts = pointInLine; } // In multiline mode we progress the line number when we find a \n // This is to allow the correct calculation of y coords if (text.contains("\n")) { lineCounter++; // If current content pointed at is null or not the correct writing mode, skip value until data is found while (lineCounter < this.content.length && (this.content[lineCounter] == null || writingMode != this.writingMode[lineCounter])) { lineCounter++; } } } // If only finding first occurance, // Stop searching this text data for search term. if (firstOccuranceOnly) { foundFirst = true; break; } } // If only finding first occurance and first is found, // Stop searching all text data for this search term. if (firstOccuranceOnly && foundFirst) { break; } } } } // Remove any trailing empty values resultCoords.trim(); // If including tease values if (this.includeTease) { // Remove any trailing empty values resultTeasers.trim(); // Store teasers so they can be retrieved by different search methods if (this.usingMultipleTerms) { // Store all teasers for so they may be returned as a sorted map // Only used for one method controled by the above flag for (int i = 0; i != resultTeasers.size(); i++) this.multipleTermTeasers.add(resultTeasers.elementAt(i)); } else { // Store all teasers to be retrieved by getTeaser() method this.teasers = resultTeasers.get(); } } } } // Return coord data for search results return resultCoords.get(); } private static String removeDuplicateSpaces(String textValue) { if (textValue.contains(" ")) { textValue = textValue.replace(" ", " "); } return textValue; } /** return endpoints from last findtext */ public float[] getEndPoints() { return this.endPoints; } /** * return text teasers from findtext if generateTeasers() called before find */ public String[] getTeasers() { return this.teasers; } /** * tell find text to generate teasers as well */ public void generateTeasers() { this.includeTease = true; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy