All downloads are free. Search and download functionality uses the official Maven repository.

org.jpedal.grouping.PdfGroupingAlgorithms Maven / Gradle / Ivy

/*
 * ===========================================
 * Java Pdf Extraction Decoding Access Library
 * ===========================================
 *
 * Project Info:  http://www.idrsolutions.com
 * Help section for developers at http://www.idrsolutions.com/support/
 *
 * (C) Copyright 1997-2016 IDRsolutions and Contributors.
 *
 * This file is part of JPedal/JPDF2HTML5
 *
     This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


 *
 * ---------------
 * PdfGroupingAlgorithms.java
 * ---------------
 */
package org.jpedal.grouping;

import java.awt.Rectangle;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jpedal.color.GenericColorSpace;
import org.jpedal.exception.PdfException;
import org.jpedal.objects.PdfData;
import org.jpedal.utils.Fonts;
import org.jpedal.utils.LogWriter;
import org.jpedal.utils.Sorts;
import org.jpedal.utils.Strip;
import org.jpedal.utils.repositories.*;
import org.jpedal.utils.repositories.generic.Vector_Rectangle_Int;

/**
 * Applies heuristics to unstructured PDF text to create content
 */
public class PdfGroupingAlgorithms {
    
    //NOTE(review): not referenced in the visible part of this file - presumably controls
    //whether HTML tags are kept in the output; confirm against the rest of the class
    private boolean includeHTMLtags;
    
    //platform line separator, read once from the "line.separator" system property
    private static final String SystemSeparator = System.getProperty("line.separator");
    
	//per-fragment flag to show this item has been merged into another (or discarded
	//as a shadow) and should be ignored. Indexed like the other per-fragment arrays below.
	private boolean[] isUsed;

	//co-ords of each text fragment (x1,y1 is top left)
	private float[] f_x1, f_x2, f_y1, f_y2;
	
	//track if we removed a space from the end of the fragment
	private boolean[] hadSpace;
	
	//colour tag held for each fragment
	private String[] f_colorTag;
	
	//writing mode of each fragment (compared against PdfData constants such as
	//PdfData.HORIZONTAL_LEFT_TO_RIGHT)
	private int[] writingMode;
	
	//font size of each fragment in pixels
	private int[] fontSize;

	//amount of space a space uses in this font/size
	//(isGapASpace multiplies it by fontSize and divides by 1000 before comparing with a gap)
	private float[] spaceWidth;

	//actual text of each fragment; set to null once the fragment is merged into another
	private StringBuilder[] content;

	//raw number of text characters in each fragment (length of text less all tokens)
	private int[] textLength;

    //Hold data from pdf so we can create local version
	private final PdfData pdf_data;

	//flag to show if output for table is CSV or XHTML (true = XHTML)
	private boolean isXHTML = true;

	//slot to insert next value - used when we split fragments for table code
	private int nextSlot;

	//vertical breaks for table calculation
	private Vector_Int lineBreaks = new Vector_Int();

	//holds details as we scan lines for table
	private Vector_Object lines;

	//lookup table used to sort into correct order for table
	private Vector_Int lineY2;

	//marker string used in content (we bury location for each char so we can split);
	//MARKER2 is its first character exposed as a char
	private static final String MARKER = PdfData.marker;
	public static final char MARKER2= MARKER.charAt(0);

	//row counter (max_rows) and pointer to the final object we merge into (master)
	private int max_rows, master;
	
	//flag to show color info is being extracted (copied from pdf_data.isColorExtracted())
	private boolean colorExtracted;
	
	//used to calculate correct order for table lines
	private int[] line_order;

	//amount we resize arrays holding content with if no space
    private static final int increment = 100;

	//NOTE(review): public static switch with no use in the visible part of this file -
	//presumably selects unrotated page co-ordinates; confirm before relying on it
	public static boolean useUnrotatedCoords;

	//flag to show if a teaser is created on findText
	private boolean includeTease;

	//teasers for findtext
	private String[] teasers;

	//NOTE(review): raw List - presumably collects teasers when several search terms
	//are used; element type is not visible here, confirm before adding generics
	private final List multipleTermTeasers = new ArrayList();

	//NOTE(review): presumably true while a search uses multiple terms - confirm
	private boolean usingMultipleTerms;

    //flag to show if output should be XML (overwritten by the constructor argument)
    private boolean isXMLExtraction=true;

	//Value placed between result areas to show they are part of the same result
	private static final int linkedSearchAreas=-101;
	
	/**
	 * Builds a grouping helper around the raw text data taken from a PDF.
	 * The colour-extraction flag is read from the supplied data straight away.
	 *
	 * @param pdf_data PdfData from the pdf to search
	 * @param isXMLExtraction flag to specify if output should be xml
	 */
	public PdfGroupingAlgorithms(final PdfData pdf_data, final boolean isXMLExtraction) {
		this.isXMLExtraction = isXMLExtraction;
		this.pdf_data = pdf_data;

		//remember whether colour info is available in the raw data
		this.colorExtracted = this.pdf_data.isColorExtracted();
	}
	
	/**
	 * Work out whether a space, a line/paragraph break or no separator should be
	 * placed between two lines that are about to be joined.
	 *
	 * The decision is based on the last two characters of the first line and the
	 * first two characters of the second line, after both have been passed through
	 * Strip (stripXML when extracting XML, trim otherwise).
	 *
	 * @param rawLine1 line whose ending is inspected
	 * @param rawLine2 line whose beginning is inspected
	 * @param isXMLExtraction true if the content carries XML markup; selects how the
	 *        lines are cleaned and which form of break is produced
	 * @return the separator to insert; a single space unless one of the checks
	 *         below replaces it
	 */
    private static String getLineDownSeparator(final StringBuilder rawLine1, final StringBuilder rawLine2, final boolean isXMLExtraction) {

		String returnValue = " "; //space is default

		//hard-wired to false, so any code guarded by it is currently inactive
		final boolean hasUnderline = false;

		//get 2 lines without any XML or spaces so we can look at last char
		//NOTE(review): exact behaviour of Strip.stripXML / Strip.trim is not visible here
        final StringBuilder line1;
        final StringBuilder line2;
        if(isXMLExtraction){
			line1 = Strip.stripXML(rawLine1,isXMLExtraction);
			line2 = Strip.stripXML(rawLine2,isXMLExtraction);
		}else{
			line1 = Strip.trim(rawLine1);
			line2 = Strip.trim(rawLine2);
		}
		
		//get lengths and only test if both lines have at least 2 chars
		//(the charAt calls below read 2 chars from each line)
		final int line1Len = line1.length();
		final int line2Len = line2.length();
		
		if((line1Len>1)&&(line2Len>1)){

			//get chars to test: last 2 chars of line 1 and first 2 chars of line 2
			//(line1Char2 is the very last char, line1Char1 the one before it)
			final char line1Char2 = line1.charAt(line1Len - 1);
			final char line1Char1 = line1.charAt(line1Len - 2);
			final char line2Char1 = line2.charAt(0);
			final char line2Char2 = line2.charAt(1);

			//deal with hyphenation first - ignore unless :- or space-
			//NOTE: hyphen_values is empty, so indexOf always returns -1 and this
			//branch can never be taken as the code stands
            final String hyphen_values = "";
            if (hyphen_values.indexOf(line1Char2) != -1) {
				returnValue = ""; //default of nothing
				if (line1Char1 == ':') {
                    returnValue = "\n";
                }
				if (line1Char2 == ' ') {
                    returnValue = " ";
                }
                //paragraph break if line 1 has a full stop in its last 2 chars and
                //line 2 has a capital letter or '&' in its first 2 chars
            } else if (
				((line1Char1 == '.') || (line1Char2 == '.'))
					&& (Character.isUpperCase(line2Char1)
						|| (line2Char1 == '&')
						|| Character.isUpperCase(line2Char2)
						|| (line2Char2 == '&'))){
				if(isXMLExtraction) {
                    returnValue = "

\n"; } else { returnValue="\n"; } } } //add an underline if appropriate if (hasUnderline){ if(isXMLExtraction) { returnValue += "

\n"; } else { returnValue += '\n'; } } return returnValue; } /** * remove shadows from text created by double printing of text and drowned * items where text inside other text */ private void cleanupShadowsAndDrownedObjects(final boolean avoidSpaces) { //get list of items final int[] items = getUnusedFragments(); final int count = items.length; int c, n; String separator; float diff; //work through objects and eliminate shadows or roll together overlaps for (int p = 0; p < count; p++) { //master item c = items[p]; //ignore used items if (!isUsed[c]) { //work out mid point in text float midX = (f_x1[c] + f_x2[c]) / 2; float midY = (f_y1[c] + f_y2[c]) / 2; for (int p2 = p + 1;p2 < count;p2++) { //item to test against n = items[p2]; //Ignore fragments that have been used or have no width if ((f_x1[n] != f_x2[n]) && (!isUsed[n]) && (!isUsed[c])) { float fontDiff = this.fontSize[n] - fontSize[c]; if (fontDiff < 0) { fontDiff = -fontDiff; } diff = (f_x2[n] - f_x1[n]) - (f_x2[c] - f_x1[c]); if(diff<0) { diff=-diff; } //stop spurious matches on overlapping text if (fontDiff==0 && (midX > f_x1[n])&& (midX < f_x2[n]) && (diff< 10) && (midY < f_y1[n])&& (midY > f_y2[n])) { isUsed[n] = true; //pick up drowned text items (item inside another) } else { final boolean a_in_b = (f_x1[n] > f_x1[c])&& (f_x2[n] < f_x2[c]) && (f_y1[n] < f_y1[c])&& (f_y2[n] > f_y2[c]); final boolean b_in_a = (f_x1[c] > f_x1[n])&& (f_x2[c] < f_x2[n]) && (f_y1[c] < f_y1[n])&& (f_y2[c] > f_y2[n]); //merge together if (a_in_b || b_in_a) { //get order right - bottom y2 underneath if (f_y2[c] > f_y2[n]) { separator =getLineDownSeparator(content[c],content[n],isXMLExtraction); if((!avoidSpaces)||(separator.indexOf(' ')==-1)){ merge(c,n,separator,true); } } else { separator =getLineDownSeparator(content[n],content[c],isXMLExtraction); if(!avoidSpaces || separator.indexOf(' ')==-1){ merge(n,c,separator,true); } } //recalculate as may have changed midX = (f_x1[c] + f_x2[c]) / 2; midY = (f_y1[c] + f_y2[c]) / 2; } 
} } } } } } /** * general routine to see if we add a space between 2 text fragments */ private String isGapASpace(final int c, final int l, final float actualGap, final boolean addMultiplespaceXMLTag, final int writingMode) { String sep = ""; float gap; //use smaller gap final float gapA = spaceWidth[c] * fontSize[c]; final float gapB = spaceWidth[l] * fontSize[l]; if (gapA > gapB) { gap = gapB; } else { gap = gapA; } gap = (actualGap / (gap / 1000)); //Round values to closest full integer as float -> int conversion rounds down if(gap > 0.51f && gap<1) { gap = 1; } final int spaceCount = (int) gap; if (spaceCount > 0) { sep = " "; } //add an XML tag to flag multiple spaces if (spaceCount > 1 && addMultiplespaceXMLTag && writingMode==PdfData.HORIZONTAL_LEFT_TO_RIGHT) { sep = " "; } return sep; } /** * merge 2 text fragments together and update co-ordinates */ private void merge(final int m, final int c, final String separator, final boolean moveFont) { //update co-ords if (f_x1[m] > f_x1[c]) { f_x1[m] = f_x1[c]; } if (f_y1[m] < f_y1[c]) { f_y1[m] = f_y1[c]; } if (f_x2[m] < f_x2[c]) { f_x2[m] = f_x2[c]; } if (f_y2[m] > f_y2[c]) { f_y2[m] = f_y2[c]; } if(isXMLExtraction){ String test=Fonts.fe; //add color tag if needed and changes if(colorExtracted) { test=Fonts.fe+GenericColorSpace.ce; } //move if needed and add separator if ((moveFont) && (content[m].toString().lastIndexOf(test)!=-1)) { final String master = content[m].toString(); content[m] =new StringBuilder(master.substring(0, master.lastIndexOf(test))); content[m].append(separator); content[m].append(master.substring(master.lastIndexOf(test))); } else{ content[m].append(separator); } //Only map out space if text length is longer than 1 if(textLength[c]>1 && content[m].toString().endsWith(" ")){ content[m].deleteCharAt(content[m].lastIndexOf(" ")); } //use font size of second text (ie at end of merged text) fontSize[m] = fontSize[c]; //Remove excess / redundent xml tags if((content[c].indexOf("", 
content[m].lastIndexOf("")+7==content[m].lastIndexOf(">"))){ content[c].replace(content[c].indexOf("")+1, ""); content[m].replace(content[m].lastIndexOf(""), content[m].lastIndexOf("")+8, ""); } if((content[c].indexOf("",content[m].lastIndexOf("")+6==content[m].lastIndexOf(">"))){ content[c].replace(content[c].indexOf("")+1, ""); content[m].replace(content[m].lastIndexOf(""), content[m].lastIndexOf("")+7, ""); } content[m] = content[m].append(content[c]); //track length of text less all tokens textLength[m] += textLength[c]; //set objects to null to flush and log as used isUsed[c] = true; content[c] = null; }else{ //use font size of second text (ie at end of merged text) fontSize[m] = fontSize[c]; //add together content[m] = content[m].append(separator).append(content[c]); //track length of text less all tokens textLength[m] += textLength[c]; //set objects to null to flush and log as used isUsed[c] = true; content[c] = null; } } /** * remove width data we may have buried in data */ private void removeEncoding() { // get list of items final int[] items = getUnusedFragments(); int current; // work through objects and eliminate shadows or roll together overlaps for (final int item : items) { // master item current = item; // ignore used items and remove widths we hid in data if (!isUsed[current]) { content[current] = removeHiddenMarkers(current); } } } /** * put raw data into Arrays for quick merging breakup_fragments shows if we * break on vertical lines and spaces */ private void copyToArraysPartial(final int minX, final int minY, final int maxX, final int maxY) { colorExtracted=pdf_data.isColorExtracted(); final int count = pdf_data.getRawTextElementCount(); //local lists for faster access //final boolean[] isUsed = new boolean[count]; final int[] fontSize = new int[count]; final int[] writingMode=new int[count]; final float[] spaceWidth = new float[count]; final StringBuilder[] content = new StringBuilder[count]; final int[] textLength = new int[count]; final 
float[] f_x1 = new float[count]; final String[] f_colorTag=new String[count]; final float[] f_x2 = new float[count]; final float[] f_y1 = new float[count]; final float[] f_y2 = new float[count]; float x1,x2,y1,y2; int currentPoint = 0; //set values for (int i = 0; i < count; i++) { //extract values x1 = pdf_data.f_x1[i]; x2 = pdf_data.f_x2[i]; y1 = pdf_data.f_y1[i]; y2 = pdf_data.f_y2[i]; final int mode=pdf_data.f_writingMode[i]; boolean accepted = false; float height; switch (mode) { case PdfData.HORIZONTAL_LEFT_TO_RIGHT: case PdfData.HORIZONTAL_RIGHT_TO_LEFT: height = y1-y2; if ((((minX < x1 && x1 < maxX) || (minX < x2 && x2 < maxX)) || //Area contains the x1 or x2 coords ((x1 < minX && minX < x2) || (x1 < maxX && maxX < x2)) //Area is within the x1 and x2 coords ) && (minY < y2 + (height / 4) && y2 + (height * 0.75) < maxY) //Area also contains atleast 3/4 of the text y coords ) { accepted = true; } break; case PdfData.VERTICAL_BOTTOM_TO_TOP: case PdfData.VERTICAL_TOP_TO_BOTTOM: height = x2-x1; if ((((minY < y1 && y1 < maxY) || (minY < y2 && y2 < maxY)) || //Area contains the x1 or x2 coords ((y2 < minY && minY < y1) || (y2 < maxY && maxY < y1)) //Area is within the x1 and x2 coords ) && (minX < x1 + (height / 4) && x1 + (height * 0.75) < maxX) //Area also contains atleast 3/4 of the text y coords ) { accepted = true; } break; } //if at least partly in the area, process if(accepted){ content[currentPoint] = new StringBuilder(pdf_data.contents[i]); fontSize[currentPoint] = pdf_data.f_end_font_size[i]; writingMode[currentPoint]=pdf_data.f_writingMode[i]; f_x1[currentPoint] = pdf_data.f_x1[i]; f_colorTag[currentPoint]=pdf_data.colorTag[i]; f_x2[currentPoint] = pdf_data.f_x2[i]; f_y1[currentPoint] = pdf_data.f_y1[i]; f_y2[currentPoint] = pdf_data.f_y2[i]; spaceWidth[currentPoint] = pdf_data.space_width[i]; textLength[currentPoint] = pdf_data.text_length[i]; StringBuilder startTags = new StringBuilder(content[currentPoint].toString().substring(0, 
content[currentPoint].toString().indexOf(MARKER))); final String contentText = content[currentPoint].toString().substring(content[currentPoint].toString().indexOf(MARKER), content[currentPoint].toString().indexOf('<', content[currentPoint].toString().lastIndexOf(MARKER))); String endTags = content[currentPoint].toString().substring(content[currentPoint].toString().lastIndexOf(MARKER)); //Skips last section of text endTags = endTags.substring(endTags.indexOf('<')); final StringTokenizer tokenizer = new StringTokenizer(contentText, MARKER); boolean setX1 = true; float width = 0; while(tokenizer.hasMoreTokens()){ String token = tokenizer.nextToken(); final float xCoord = (Float.parseFloat(token)); token = tokenizer.nextToken(); width = Float.parseFloat(token); token = tokenizer.nextToken(); final String character = token; if(setX1){ if ((mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT)){ f_x1[currentPoint] = xCoord; }else{ f_y2[currentPoint] = xCoord; } setX1 = false; } if ((mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT)){ f_x2[currentPoint] = xCoord; }else{ f_y1[currentPoint] = xCoord; } boolean storeValues = false; if ((mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT)){ if(minX