All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.jpedal.grouping.PdfSearchUtils Maven / Gradle / Ivy

There is a newer version: 7.15.25
Show newest version
/*
 * ===========================================
 * Java Pdf Extraction Decoding Access Library
 * ===========================================
 *
 * Project Info:  http://www.idrsolutions.com
 * Help section for developers at http://www.idrsolutions.com/support/
 *
 * (C) Copyright 1997-2017 IDRsolutions and Contributors.
 *
 * This file is part of JPedal/JPDF2HTML5
 *
 @LICENSE@
 *
 * ---------------
 * PdfSearchUtils.java
 * ---------------
 */
package org.jpedal.grouping;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jpedal.exception.PdfException;

import static org.jpedal.grouping.PdfGroupingAlgorithms.removeHiddenMarkers;

import org.jpedal.objects.PdfData;
import org.jpedal.utils.Fonts;
import org.jpedal.utils.LogWriter;
import org.jpedal.utils.Strip;
import org.jpedal.utils.repositories.Vector_Float;
import org.jpedal.utils.repositories.Vector_String;

public class PdfSearchUtils {

    //When set, found terms placed in teasers are wrapped in bold markup
    private boolean includeHTMLtags;

    //Teasers collected from every search term and writing mode, in result order.
    //Typed as List<String> so toArray(new String[n]) yields a String[] (a raw List yields Object[])
    private final List<String> multipleTermTeasers = new ArrayList<String>();

    //Hold data from pdf so we can create local version
    private final PdfData pdf_data;

    //Text fragments copied from pdf_data for the current search
    private Line[] fragments;
    //Lines read when building the search text and locating result coords
    private Line[] lines;

    //Value placed between result areas to show they are part of the same result
    private static final int MULTIPLE_AREA_RESULT = -101;

    //When set, a teaser is stored for every result found
    private boolean includeTease;

    /**
     * Creates a search helper that reads its text from the supplied data object.
     *
     * @param pdf_data text data the searches are run against; the reference is stored as-is
     */
    protected PdfSearchUtils(final PdfData pdf_data) {
        this.pdf_data = pdf_data;
    }

    /**
     * Search a particular area with in pdf page currently loaded and return the areas
     * of the results found as an array of float values.
     *
     * @param x1         is the x coord of the top left corner
     * @param y1         is the y coord of the top left corner
     * @param x2         is the x coord of the bottom right corner
     * @param y2         is the y coord of the bottom right corner
     * @param terms      : String[] of search terms, each String is treated as a single term
     * @param searchType : int containing bit flags for the search (See class SearchType)
     * @return the coords of the found text in a float[] where the coords are pdf page coords.
     * The origin of the coords is the bottom left hand corner (on unrotated page) organised in the following order.
* [0]=result x1 coord
* [1]=result y1 coord
* [2]=result x2 coord
* [3]=result y2 coord
* [4]=either -101 to show that the next text area is the remainder of this word on another line else any other value is ignored.
* @throws PdfException if the page content being search contains invalid data that the search can not recover from */ @SuppressWarnings("UnusedParameters") protected final float[] findText( int x1, int y1, int x2, int y2, final String[] terms, final int searchType) throws PdfException { //Failed to supply search terms to do nothing if (terms == null) { return new float[]{}; } //Search result and teaser holders final Vector_Float resultCoords = new Vector_Float(0); final Vector_String resultTeasers = new Vector_String(0); //make sure co-ords valid and throw exception if not final int[] v = validateCoordinates(x1, y1, x2, y2); x1 = v[0]; y1 = v[1]; x2 = v[2]; y2 = v[3]; //Extract the text data into local arrays for searching copyToArraysPartial(x1, y2, x2, y1); //Remove any hidden text on page as should not be found cleanupShadowsAndDrownedObjects(false); //Get unused text objects and sort them for correct searching final Line[] localLines = fragments.clone(); final int[] unsorted = getWritingModeCounts(localLines); final int[] writingModes = getWritingModeOrder(unsorted); for (int u = 0; u != writingModes.length; u++) { final int mode = writingModes[u]; //if not lines for writing mode, ignore if (unsorted[mode] != 0) { searchWritingMode(mode, searchType, terms, resultCoords, resultTeasers); } } //Return coord data for search results return resultCoords.get(); } /** * return text teasers from findtext if generateTeasers() called before find * * @return String[] representing teasers for each result (single of linked areas) in result order */ protected String[] getTeasers() { return multipleTermTeasers.toArray(new String[multipleTermTeasers.size()]); } /** * put raw data into Arrays for quick merging breakup_fragments shows if we * break on vertical lines and spaces */ private void copyToArraysPartial(final int minX, final int minY, final int maxX, final int maxY) { final int count = pdf_data.getRawTextElementCount(); final Line[] localFragments = new Line[count]; int 
currentPoint = 0; final String marker = PdfData.marker; //set values for (int i = 0; i < count; i++) { //if at least partly in the area, process if (isFragmentWithinArea(pdf_data, i, minX, minY, maxX, maxY)) { final int mode = pdf_data.f_writingMode[i]; localFragments[currentPoint] = new Line(pdf_data, i); final StringBuilder startTags = new StringBuilder(localFragments[currentPoint].getRawData().substring(0, localFragments[currentPoint].getRawData().indexOf(marker))); final String contentText = localFragments[currentPoint].getRawData().substring(localFragments[currentPoint].getRawData().indexOf(marker), localFragments[currentPoint].getRawData().indexOf('<', localFragments[currentPoint].getRawData().lastIndexOf(marker))); String endTags = localFragments[currentPoint].getRawData().substring(localFragments[currentPoint].getRawData().lastIndexOf(marker)); //Skips last section of text endTags = endTags.substring(endTags.indexOf('<')); final StringTokenizer tokenizer = new StringTokenizer(contentText, marker); boolean setX1 = true; float width = 0; while (tokenizer.hasMoreTokens()) { String token = tokenizer.nextToken(); final float xCoord = (Float.parseFloat(token)); token = tokenizer.nextToken(); width = Float.parseFloat(token); token = tokenizer.nextToken(); final String character = token; if (setX1) { if ((mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT)) { localFragments[currentPoint].setX1(xCoord); } else { localFragments[currentPoint].setY2(xCoord); } setX1 = false; } if ((mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT)) { localFragments[currentPoint].setX2(xCoord); } else { localFragments[currentPoint].setY1(xCoord); } boolean storeValues = false; if ((mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT)) { if (minX < xCoord && (xCoord + width) < maxX) { storeValues = true; } } else { if (minY < xCoord && (xCoord + width) < maxY) { storeValues = true; } } 
if (storeValues) { startTags.append(marker); startTags.append(xCoord); //Add X Coord startTags.append(marker); startTags.append(width); //Add Width startTags.append(marker); startTags.append(character); //Add Letter } } localFragments[currentPoint].setRawData(startTags.append(endTags).toString()); if ((mode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT)) { localFragments[currentPoint].setX2(localFragments[currentPoint].getX2() + width); } else { localFragments[currentPoint].setY1(localFragments[currentPoint].getY1() + width); } currentPoint++; } } fragments = new Line[currentPoint]; for (int i = 0; i != currentPoint; i++) { fragments[i] = localFragments[i]; } } private static boolean isFragmentWithinArea(final PdfData pdf_data, final int i, final int minX, final int minY, final int maxX, final int maxY) { //extract values final float x1 = pdf_data.f_x1[i]; final float x2 = pdf_data.f_x2[i]; final float y1 = pdf_data.f_y1[i]; final float y2 = pdf_data.f_y2[i]; final int mode = pdf_data.f_writingMode[i]; final float height; switch (mode) { case PdfData.HORIZONTAL_LEFT_TO_RIGHT: case PdfData.HORIZONTAL_RIGHT_TO_LEFT: height = y1 - y2; if ((((minX < x1 && x1 < maxX) || (minX < x2 && x2 < maxX)) || //Area contains the x1 or x2 coords ((x1 < minX && minX < x2) || (x1 < maxX && maxX < x2)) //Area is within the x1 and x2 coords ) && (minY < y2 + (height / 4) && y2 + (height * 0.75) < maxY) //Area also contains atleast 3/4 of the text y coords ) { return true; } break; case PdfData.VERTICAL_BOTTOM_TO_TOP: case PdfData.VERTICAL_TOP_TO_BOTTOM: height = x2 - x1; if ((((minY < y1 && y1 < maxY) || (minY < y2 && y2 < maxY)) || //Area contains the x1 or x2 coords ((y2 < minY && minY < y1) || (y2 < maxY && maxY < y1)) //Area is within the x1 and x2 coords ) && (minX < x1 + (height / 4) && x1 + (height * 0.75) < maxX) //Area also contains atleast 3/4 of the text y coords ) { return true; } break; } return false; } /** * make sure co-ords valid and 
throw exception if not */ private static int[] validateCoordinates(int x1, int y1, int x2, int y2) { if ((x1 > x2) | (y1 < y2)) { if (x1 > x2) { final int temp = x1; x1 = x2; x2 = temp; LogWriter.writeLog("x1 > x2, coordinates were swapped to validate"); } if (y1 < y2) { final int temp = y1; y1 = y2; y2 = temp; LogWriter.writeLog("y1 < y2, coordinates were swapped to validate"); } } return new int[]{x1, y1, x2, y2}; } // /** * Search with in pdf page currently loaded and return the areas * of the results found as an array of float values. *

* Method to find text in the specified area allowing for the text to be split across multiple lines.
* * @param terms = the text to search for * @param searchType = info on how to search the pdf * @return the coords of the found text in a float[] where the coords are pdf page coords. * The origin of the coords is the bottom left hand corner (on unrotated page) organised in the following order.
* [0]=result x1 coord
* [1]=result y1 coord
* [2]=result x2 coord
* [3]=result y2 coord
* [4]=either -101 to show that the next text area is the remainder of this word on another line else any other value is ignored.
* @throws PdfException if the page content being search contains invalid data that the search can not recover from */ protected final float[] findText( final String[] terms, final int searchType) throws PdfException { //Failed to supply search terms to do nothing if (terms == null) { return new float[]{}; } //Search result and teaser holders final Vector_Float resultCoords = new Vector_Float(0); final Vector_String resultTeasers = new Vector_String(0); //Extract the text data into local arrays for searching copyToArrays(); //Remove any hidden text on page as should not be found cleanupShadowsAndDrownedObjects(false); //Get unused text objects and sort them for correct searching // final int[] items = getsortedUnusedFragments(true, false); final Line[] localLines = fragments.clone(); final int[] unsorted = getWritingModeCounts(localLines); final int[] writingModes = getWritingModeOrder(unsorted); for (int u = 0; u != writingModes.length; u++) { final int mode = writingModes[u]; if (unsorted[mode] != 0) { searchWritingMode(mode, searchType, terms, resultCoords, resultTeasers); } } //Return coord data for search results return resultCoords.get(); } private void searchWritingMode(final int mode, final int searchType, final String[] terms, final Vector_Float resultCoords, final Vector_String resultTeasers) throws PdfException { //Flags to control the different search options boolean firstOccuranceOnly = false; boolean wholeWordsOnly = false; boolean foundFirst = false; boolean useRegEx = false; //Merge text localFragments into lines as displayed on page createLinesForSearch(mode, false, false, true); //Bitwise flags for regular expressions engine, options always required final int options = loadSearcherOptions(searchType); //Only find first occurance of each search term if ((searchType & SearchType.FIND_FIRST_OCCURANCE_ONLY) == SearchType.FIND_FIRST_OCCURANCE_ONLY) { firstOccuranceOnly = true; } //Only find whole words, not partial words if ((searchType & 
SearchType.WHOLE_WORDS_ONLY) == SearchType.WHOLE_WORDS_ONLY) { wholeWordsOnly = true; } //Allow the use of regular expressions symbols if ((searchType & SearchType.USE_REGULAR_EXPRESSIONS) == SearchType.USE_REGULAR_EXPRESSIONS) { useRegEx = true; } //Check if coords need swapping final boolean valuesSwapped = (mode == PdfData.VERTICAL_BOTTOM_TO_TOP || mode == PdfData.VERTICAL_TOP_TO_BOTTOM); //Portions of text to perform the search on and find teasers final String searchText = buildSearchText(false, mode); final String coordsText = buildSearchText(true, mode); //Hold starting point data at page rotation int[] resultStart; //Work through the search terms one at a time for (int j = 0; j != terms.length; j++) { String searchValue = alterStringTooDisplayOrder(terms[j]); //Set the default separator between words in a search term String sep = " "; //Multiline needs space or newline to be recognised as word separators if ((searchType & SearchType.MUTLI_LINE_RESULTS) == SearchType.MUTLI_LINE_RESULTS) { sep = "[ \\\\n]+"; } //if not using reg ex add reg ex literal flags around the text and word separators if (!useRegEx) { searchValue = "\\Q" + searchValue + "\\E"; sep = "\\\\E" + sep + "\\\\Q"; } //If word seperator has changed, replace all spaces with modified seperator if (!sep.equals(" ")) { searchValue = searchValue.replaceAll(" ", sep); } //Surround search term with word boundry tags to match whole words if (wholeWordsOnly) { searchValue = "\\b" + searchValue + "\\b"; } //Create pattern to match search term final Pattern searchTerm = Pattern.compile(searchValue, options); //Create pattern to match search term with two words before and after final Pattern teaserTerm = Pattern.compile("(?:\\S+\\s)?\\S*(?:\\S+\\s)?\\S*" + searchValue + "\\S*(?:\\s\\S+)?\\S*(?:\\s\\S+)?", options); //So long as text data is not null if (searchText != null) { //Create two matchers for finding search term and teaser final Matcher termFinder = searchTerm.matcher(searchText); final Matcher 
teaserFinder = teaserTerm.matcher(searchText); final boolean needToFindTeaser = true; //Keep looping till no result is returned while (termFinder.find()) { resultStart = null; //Make note of the text found and index in the text String foundTerm = termFinder.group(); final int termStarts = termFinder.start(); final int termEnds = termFinder.end() - 1; //If storing teasers if (includeTease) { if (includeHTMLtags) { foundTerm = "" + foundTerm + ""; } if (needToFindTeaser) { findTeaser(foundTerm, teaserFinder, termStarts, termEnds, resultTeasers); } } getResultCoords(coordsText, mode, resultStart, termStarts, termEnds, valuesSwapped, resultCoords); //If only finding first occurance, //Stop searching this text data for search term. if (firstOccuranceOnly) { foundFirst = true; break; } } //If only finding first occurance and first is found, //Stop searching all text data for this search term. if (firstOccuranceOnly && foundFirst) { break; } } } //Remove any trailing empty values resultCoords.trim(); //If including tease values if (includeTease) { storeTeasers(resultTeasers); } } private void getResultCoords(final String coordText, final int mode, int[] resultStart, int termStarts, final int termEnds, final boolean valuesSwapped, final Vector_Float resultCoords) { //Get coords of found text for highlights float currentX; float width; final char MARKER2 = PdfGroupingAlgorithms.MARKER2; //Track point in text data line (without coord data) int pointInLine = -1; //Track line on page int lineCounter = 0; //Skip null values and value not in the correct writing mode to ensure correct result coords while (lines[lineCounter].getRawData() == null || Strip.stripXML(lines[lineCounter].getRawData(), true).toString().isEmpty() || mode != lines[lineCounter].getWritingMode()) { lineCounter++; } //Flags used to catch if result is split accross lines boolean startFound = false; boolean endFound = false; //Cycle through coord text looking for coords of this result //Ignore first value as it 
is known to be the first marker for (int pointer = 1; pointer < coordText.length(); pointer++) { // find second marker and get x coord int startPointer = pointer; while (pointer < coordText.length()) { if (coordText.charAt(pointer) == MARKER2) { break; } pointer++; } //Convert text to float value for x coord currentX = Float.parseFloat(coordText.substring(startPointer, pointer)); pointer++; // find third marker and get width startPointer = pointer; while (pointer < coordText.length()) { if (coordText.charAt(pointer) == MARKER2) { break; } pointer++; } //Convert text to float value for character width width = Float.parseFloat(coordText.substring(startPointer, pointer)); pointer++; // find fourth marker and get text (character) startPointer = pointer; while (pointer < coordText.length()) { if (coordText.charAt(pointer) == MARKER2) { break; } pointer++; } //Store text to check for newline character later final String text = coordText.substring(startPointer, pointer); pointInLine += text.length(); //Start of term not found yet. //Point in line is equal to or greater than start of the term. //Store coords and mark start as found. if (!startFound && pointInLine >= termStarts) { int currentY = (int) lines[lineCounter].getY1(); if (valuesSwapped) { currentY = (int) lines[lineCounter].getX2(); } resultStart = new int[]{(int) currentX, currentY}; startFound = true; } //End of term not found yet. //Point in line is equal to or greater than end of the term. //Store coords and mark end as found. if (!endFound && pointInLine >= termEnds) { int currentY = (int) lines[lineCounter].getY2(); if (valuesSwapped) { currentY = (int) lines[lineCounter].getX1(); } storeResultsCoords(valuesSwapped, mode, resultCoords, resultStart[0], resultStart[1], (currentX + width), currentY, 0.0f); endFound = true; } //Using multi line option. //Start of term found. //End of term not found. //New line character found. //Set up multi line result. 
if (startFound && !endFound && text.contains("\n")) { storeResultsCoords(valuesSwapped, mode, resultCoords, resultStart[0], resultStart[1], (currentX + width), lines[lineCounter].getY2(), MULTIPLE_AREA_RESULT); //Set start of term as not found startFound = false; //Set this point in line as start of next term //Guarantees next character is found as //start of the next part of the search term termStarts = pointInLine; } //In multiline mode we progress the line number when we find a \n //This is to allow the correct calculation of y coords if (text.contains("\n")) { lineCounter++; //If current content pointed at is null or not the correct writing mode, skip value until data is found while (lineCounter < lines.length && (lines[lineCounter].getRawData() == null || Strip.stripXML(lines[lineCounter].getRawData(), true).toString().isEmpty() || mode != lines[lineCounter].getWritingMode())) { lineCounter++; } } } } protected void clearStoredTeasers() { multipleTermTeasers.clear(); } private void storeTeasers(final Vector_String resultTeasers) { //Remove any trailing empty values resultTeasers.trim(); final String[] results = resultTeasers.get(); for (int i = 0; i != results.length; i++) { multipleTermTeasers.add(results[i]); } //Prevent issue this not getting cleared between writing modes //resulting in duplicate teasers resultTeasers.clear(); } private static void storeResultsCoords(final boolean valuesSwapped, final int mode, final Vector_Float resultCoords, final float x1, final float y1, final float x2, final float y2, final float connected) { //Set ends coords if (valuesSwapped) { if (mode == PdfData.VERTICAL_BOTTOM_TO_TOP) { resultCoords.addElement(y2); resultCoords.addElement(x2); resultCoords.addElement(y1); resultCoords.addElement(x1); resultCoords.addElement(connected); //Mark next result as linked } else { resultCoords.addElement(y2); resultCoords.addElement(x1); resultCoords.addElement(y1); resultCoords.addElement(x2); resultCoords.addElement(connected); //Mark 
next result as linked } } else { resultCoords.addElement(x1); resultCoords.addElement(y1); resultCoords.addElement(x2); resultCoords.addElement(y2); resultCoords.addElement(connected); //Mark next result as linked } } private void findTeaser(String teaser, final Matcher teaserFinder, final int termStarts, final int termEnds, final Vector_String resultTeasers) { if (teaserFinder.find()) { //Get a teaser if found and set the search term to bold is allowed if (teaserFinder.start() < termStarts && teaserFinder.end() > termEnds) { //replace default with found teaser teaser = teaserFinder.group(); if (includeHTMLtags) { //Calculate points to add bold tags final int teaseStarts = termStarts - teaserFinder.start(); final int teaseEnds = (termEnds - teaserFinder.start()) + 1; //Add bold tags teaser = teaser.substring(0, teaseStarts) + "" + teaser.substring(teaseStarts, teaseEnds) + "" + teaser.substring(teaseEnds, teaser.length()); } teaserFinder.region(termEnds + 1, teaserFinder.regionEnd()); } } //Store teaser resultTeasers.addElement(teaser); } private static String alterStringTooDisplayOrder(final String testTerm) { String currentBlock = ""; String searchValue = ""; byte lastDirection = Character.getDirectionality(testTerm.charAt(0)); for (int i = 0; i != testTerm.length(); i++) { byte dir = Character.getDirectionality(testTerm.charAt(i)); //Only track is changing from left to right or right to left switch (dir) { case Character.DIRECTIONALITY_RIGHT_TO_LEFT: case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC: case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: dir = Character.DIRECTIONALITY_RIGHT_TO_LEFT; break; case Character.DIRECTIONALITY_LEFT_TO_RIGHT: case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: dir = Character.DIRECTIONALITY_LEFT_TO_RIGHT; break; default: dir = lastDirection; break; } if (dir != lastDirection) { //Save and reset block is 
direction changed searchValue += currentBlock; currentBlock = ""; lastDirection = dir; } //Store value based on writing mode if (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT) { currentBlock = testTerm.charAt(i) + currentBlock; } else { currentBlock += testTerm.charAt(i); } } searchValue += currentBlock; return searchValue; } private String buildSearchText(final boolean includeCoords, final int mode) { //Portions of text to perform the search on and find teasers String searchText; //Merge all text into one with \n line separators //This will allow checking for multi line split results final StringBuilder str = new StringBuilder(); for (int i = 0; i != lines.length; i++) { if (lines[i].getRawData() != null && mode == lines[i].getWritingMode()) { str.append(lines[i].getRawData()).append('\n'); } } //Remove double spaces, replacing them with single spaces searchText = removeDuplicateSpaces(str.toString()); //Strip xml and coords data from content and keep text data if (!includeCoords) { searchText = removeHiddenMarkers(searchText); } searchText = Strip.stripXML(searchText, true).toString(); //Store text in the search and teaser arrays return searchText; } private static String removeDuplicateSpaces(String textValue) { if (textValue.contains(" ")) { textValue = textValue.replace(" ", " "); } return textValue; } private static int loadSearcherOptions(final int searchType) { //Bitwise flags for regular expressions engine, options always required int options = 0; //Turn on case sensitive mode if ((searchType & SearchType.CASE_SENSITIVE) != SearchType.CASE_SENSITIVE) { options = (options | Pattern.CASE_INSENSITIVE); } //Allow search to find split line results if ((searchType & SearchType.MUTLI_LINE_RESULTS) == SearchType.MUTLI_LINE_RESULTS) { options = (options | Pattern.MULTILINE | Pattern.DOTALL); } return options; } private static int[] getWritingModeOrder(final int[] unsorted) { final int[] sorted = {unsorted[0], unsorted[1], unsorted[2], unsorted[3]}; //Set all to -1 
so we can tell if it's been set yet final int[] writingModes = {-1, -1, -1, -1}; Arrays.sort(sorted); for (int i = 0; i != unsorted.length; i++) { for (int j = 0; j < sorted.length; j++) { if (unsorted[i] == sorted[j]) { int pos = j - 3; if (pos < 0) { pos = -pos; } if (writingModes[pos] == -1) { writingModes[pos] = i; j = sorted.length; } } } } return writingModes; } private int[] getWritingModeCounts(final Line[] items) { //check orientation and get preferred. Items not correct will be ignored int l2r = 0; int r2l = 0; int t2b = 0; int b2t = 0; for (int i = 0; i != items.length; i++) { switch (items[i].getWritingMode()) { case 0: l2r++; break; case 1: r2l++; break; case 2: t2b++; break; case 3: b2t++; break; } } return new int[]{l2r, r2l, t2b, b2t}; } /** * remove shadows from text created by double printing of text and drowned * items where text inside other text */ private void cleanupShadowsAndDrownedObjects(final boolean avoidSpaces) { //get list of items // final int[] items = getUnusedFragments(); final int count = fragments.length; int master, child; String separator; float diff; //work through objects and eliminate shadows or roll together overlaps for (int p = 0; p < count; p++) { //master item master = p; //ignore used items //work out mid point in text float midX = (fragments[master].getX1() + fragments[master].getX2()) / 2; float midY = (fragments[master].getY1() + fragments[master].getY2()) / 2; for (int p2 = p + 1; p2 < count; p2++) { //item to test against child = p2; //Ignore localFragments that have been used or have no width if ((fragments[child].getX1() != fragments[child].getX2()) && (!fragments[child].hasMerged()) && (!fragments[master].hasMerged())) { float fontDiff = fragments[child].getFontSize() - fragments[master].getFontSize(); if (fontDiff < 0) { fontDiff = -fontDiff; } diff = (fragments[child].getX2() - fragments[child].getX1()) - (fragments[master].getX2() - fragments[master].getX1()); if (diff < 0) { diff = -diff; } //stop spurious 
matches on overlapping text if (fontDiff == 0 && (midX > fragments[child].getX1()) && (midX < fragments[child].getX2()) && (diff < 10) && (midY < fragments[child].getY1()) && (midY > fragments[child].getY2())) { fragments[child].setMerged(true); //pick up drowned text items (item inside another) } else { final boolean a_in_b = (fragments[child].getX1() > fragments[master].getX1()) && (fragments[child].getX2() < fragments[master].getX2()) && (fragments[child].getY1() < fragments[master].getY1()) && (fragments[child].getY2() > fragments[master].getY2()); final boolean b_in_a = (fragments[master].getX1() > fragments[child].getX1()) && (fragments[master].getX2() < fragments[child].getX2()) && (fragments[master].getY1() < fragments[child].getY1()) && (fragments[master].getY2() > fragments[child].getY2()); //merge together if (a_in_b || b_in_a) { //get order right - bottom y2 underneath if (fragments[master].getY2() > fragments[child].getY2()) { separator = getLineDownSeparator(fragments[master].getRawData(), fragments[child].getRawData()); if ((!avoidSpaces) || (separator.indexOf(' ') == -1)) { merge(fragments[master], fragments[child], separator); } } else { separator = getLineDownSeparator(fragments[child].getRawData(), fragments[master].getRawData()); if (!avoidSpaces || separator.indexOf(' ') == -1) { merge(fragments[master], fragments[child], separator); } } //recalculate as may have changed midX = (fragments[master].getX1() + fragments[master].getX2()) / 2; midY = (fragments[master].getY1() + fragments[master].getY2()) / 2; } } } } } } /** * workout if we should use space, CR or no separator when joining lines */ private static String getLineDownSeparator(final String rawLine1, final String rawLine2) { String returnValue = " "; //space is default final boolean hasUnderline = false; //get 2 lines without any XML or spaces so we can look at last char StringBuilder line1 = new StringBuilder(rawLine1); StringBuilder line2 = new StringBuilder(rawLine2); line1 = 
Strip.trim(line1); line2 = Strip.trim(line2); //get lengths and if appropriate perform tests final int line1Len = line1.length(); final int line2Len = line2.length(); if ((line1Len > 1) && (line2Len > 1)) { //get chars to test final char line1Char2 = line1.charAt(line1Len - 1); final char line1Char1 = line1.charAt(line1Len - 2); final char line2Char1 = line2.charAt(0); final char line2Char2 = line2.charAt(1); //deal with hyphenation first - ignore unless :- or space- final String hyphen_values = ""; if (hyphen_values.indexOf(line1Char2) != -1) { returnValue = ""; //default of nothing if (line1Char1 == ':') { returnValue = "\n"; } if (line1Char2 == ' ') { returnValue = " "; } //paragraph breaks if full stop and next line has ascii char or Capital Letter } else if ( ((line1Char1 == '.') || (line1Char2 == '.')) && (Character.isUpperCase(line2Char1) || (line2Char1 == '&') || Character.isUpperCase(line2Char2) || (line2Char2 == '&'))) { returnValue = "\n"; } } //add an underline if appropriate if (hasUnderline) { returnValue += '\n'; } return returnValue; } /** * general routine to see if we add a space between 2 text localFragments */ private String isGapASpace(final int c, final int l, final float actualGap, final boolean addMultiplespaceXMLTag, final int writingMode) { String sep = ""; float gap; //use smaller gap final float gapA = fragments[c].getSpaceWidth() * fragments[c].getFontSize(); final float gapB = fragments[l].getSpaceWidth() * fragments[l].getFontSize(); if (gapA > gapB) { gap = gapB; } else { gap = gapA; } gap = (actualGap / (gap / 1000)); //Round values to closest full integer as float -> int conversion rounds down if (gap > 0.51f && gap < 1) { gap = 1; } final int spaceCount = (int) gap; if (spaceCount > 0) { sep = " "; } //add an XML tag to flag multiple spaces if (spaceCount > 1 && addMultiplespaceXMLTag && writingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) { sep = " "; } return sep; } /** * convert localFragments into lines of text */ 
@SuppressWarnings("unused") private void createLinesForSearch(final int mode, final boolean breakOnSpace, final boolean addMultiplespaceXMLTag, final boolean isSearch) throws PdfException { String separator; final boolean debug = false; //create local copies of arrays final Line[] localLines = fragments.clone(); // final boolean[] isUsed = new boolean[lines.length]; int finalCount = localLines.length; for (int i = 0; i != localLines.length; i++) { if (localLines[i].hasMerged) { finalCount--; } } //reverse order if text right to left if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) { for (int i = 0; i < localLines.length; i++) { localLines[i] = fragments[localLines.length - i - 1]; } } //scan items joining best fit to right of each fragment to build lines. for (int master = 0; master < localLines.length; master++) { int id = -1; //float smallest_gap = -1, gap, yMidPt; if (!localLines[master].hasMerged() && localLines[master].getWritingMode() == mode) { if (debug) { System.out.println("Look for match with " + removeHiddenMarkers(localLines[master].getRawData())); } for (int child = 0; child < localLines.length && id == -1; child++) { /* * Coordinates altered so x axis positive follows line direction * and y axis negative follows paragraph direction. 
* Coordinates in the order x1, y1, x2, y2 */ final float[] masterCoords = getCoordsForWritingMode(localLines[master], mode); final float[] childCoords = getCoordsForWritingMode(localLines[child], mode); if (!localLines[child].hasMerged() && master != child && localLines[master].getWritingMode() == localLines[child].getWritingMode() && childCoords[0] != childCoords[2]) { if (debug) { System.out.println("Checking " + removeHiddenMarkers(localLines[child].getRawData())); } //Get central points final float mx = masterCoords[0] + ((masterCoords[2] - masterCoords[0]) / 2); final float my = masterCoords[3] + ((masterCoords[1] - masterCoords[3]) / 2); final float cx = childCoords[0] + ((childCoords[2] - childCoords[0]) / 2); final float cy = childCoords[3] + ((childCoords[1] - childCoords[3]) / 2); float smallestHeight = (masterCoords[1] - masterCoords[3]); final float fontDifference = (childCoords[1] - childCoords[3]) - smallestHeight; if (fontDifference < 0) { smallestHeight = (childCoords[1] - childCoords[3]); } //Don't merge is font of 1 is twice the size if (Math.abs(fontDifference) < smallestHeight * 2) { //Check for the same line by checking the center of //child is within master area if (Math.abs(my - cy) < (smallestHeight * 0.5)) { if (mx < cx) { //Child on right final float distance = childCoords[0] - masterCoords[2]; if (distance <= smallestHeight / 2) { id = child; } } } } //Match has been found if (id != -1) { float possSpace = childCoords[0] - masterCoords[2]; if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) { possSpace = -possSpace; } //add space if gap between this and last object separator = isGapASpace(master, id, possSpace, addMultiplespaceXMLTag, mode); //merge if adjoin if (breakOnSpace && separator.startsWith(" ")) { break; } if (debug) { System.out.println("Merge items " + master + " & " + id); System.out.println("c : " + removeHiddenMarkers(localLines[master].getRawData())); System.out.println("id : " + 
removeHiddenMarkers(localLines[id].getRawData())); System.out.println(""); } if ((isSearch && (child != master && ((childCoords[0] > masterCoords[0] && mode != PdfData.VERTICAL_TOP_TO_BOTTOM) || (childCoords[0] < masterCoords[0] && mode == PdfData.VERTICAL_TOP_TO_BOTTOM) && localLines[master].getWritingMode() == mode))) || (!isSearch && (child != master && ((childCoords[0] > masterCoords[0] && mode != PdfData.VERTICAL_TOP_TO_BOTTOM) || childCoords[0] < masterCoords[0] && mode == PdfData.VERTICAL_TOP_TO_BOTTOM && localLines[master].getWritingMode() == mode)))) { //see if on right merge(localLines[master], localLines[id], separator); finalCount--; } id = -1; } } } } } lines = new Line[finalCount]; int next = 0; for (int i = 0; i != localLines.length; i++) { if (!localLines[i].hasMerged()) { lines[next] = localLines[i]; next++; } } } private float[] getCoordsForWritingMode(final Line line, final int mode) throws PdfException { final float[] results = new float[4]; //set pointers so left to right text switch (mode) { case PdfData.HORIZONTAL_LEFT_TO_RIGHT: results[0] = line.getX1(); results[2] = line.getX2(); results[1] = line.getY1(); results[3] = line.getY2(); break; case PdfData.HORIZONTAL_RIGHT_TO_LEFT: results[2] = line.getX1(); results[0] = line.getX2(); results[1] = line.getY1(); results[3] = line.getY2(); break; case PdfData.VERTICAL_BOTTOM_TO_TOP: results[0] = line.getY2(); results[2] = line.getY1(); results[1] = line.getX2(); results[3] = line.getX1(); break; case PdfData.VERTICAL_TOP_TO_BOTTOM: results[0] = line.getY2(); results[2] = line.getY1(); results[3] = line.getX1(); results[1] = line.getX2(); break; default: throw new PdfException("Illegal value " + mode + " for currentWritingMode"); } return results; } /** * merge 2 text localFragments together and update co-ordinates */ private void merge(final Line master, final Line child, final String separator) { //update co-ords if (master.getX1() > child.getX1()) { master.setX1(child.getX1()); } if 
(master.getY1() < child.getY1()) { master.setY1(child.getY1()); } if (master.getX2() < child.getX2()) { master.setX2(child.getX2()); } if (master.getY2() > child.getY2()) { master.setY2(child.getY2()); } final String test = Fonts.fe; StringBuilder masterString = new StringBuilder(master.getRawData()); final StringBuilder childString = new StringBuilder(child.getRawData()); //move if needed and add separator if ((masterString.toString().lastIndexOf(test) != -1)) { final String masterLocal = masterString.toString(); masterString = new StringBuilder(masterLocal.substring(0, masterLocal.lastIndexOf(test))); masterString.append(separator); masterString.append(masterLocal.substring(masterLocal.lastIndexOf(test))); } else { masterString.append(separator); } //Only map out space if text length is longer than 1 if (child.getTextLength() > 1 && masterString.toString().endsWith(" ")) { masterString.deleteCharAt(masterString.lastIndexOf(" ")); } //use font size of second text (ie at end of merged text) master.setFontSize(child.getFontSize()); //Remove excess / redundent xml tags if ((childString.indexOf("", masterString.lastIndexOf("") + 7 == masterString.lastIndexOf(">"))) { childString.replace(childString.indexOf("") + 1, ""); masterString.replace(masterString.lastIndexOf(""), masterString.lastIndexOf("") + 8, ""); } if ((childString.indexOf("", masterString.lastIndexOf("") + 6 == masterString.lastIndexOf(">"))) { childString.replace(childString.indexOf("") + 1, ""); masterString.replace(masterString.lastIndexOf(""), masterString.lastIndexOf("") + 7, ""); } masterString = masterString.append(childString); //track length of text less all tokens master.setTextLength(master.getTextLength() + child.getTextLength()); //set objects to null to flush and log as used child.setRawData(null); child.setMerged(true); master.setRawData(masterString.toString()); // //use font size of second text (ie at end of merged text) // master.setFontSize(child.getFontSize()); // // //add together // 
StringBuilder content = new StringBuilder(); // content.append(master.getRawData()).append(separator).append(child.getRawData()); // master.setRawData(content.toString()); // // //track length of text less all tokens // master.setTextLength(master.getTextLength()+child.getTextLength()); // // //set objects to null to flush and log as used // child.setRawData(null); // child.setMerged(true); } private void copyToArrays() { final int count = pdf_data.getRawTextElementCount(); fragments = new Line[count]; //set values for (int i = 0; i < count; i++) { fragments[i] = new Line(pdf_data, i); } } /** * sets if we include HTML in teasers * (do we want this is word or this is word as teaser) * * @param value True to include HTML, otherwise false */ protected void setIncludeHTML(final boolean value) { includeHTMLtags = value; } /** * Flag if teasers should be generated whilst searching * * @param value True to generate teasers, otherwise false */ protected void generateTeasers(final boolean value) { includeTease = value; } /** * Return flag to control teaser generation * * @return True if teasers are being generated, otherwise false */ protected boolean isGeneratingTeasers() { return includeTease; } private class Line implements Comparable { private float x1, y1, x2, y2, character_spacing, spaceWidth; private String raw, currentColor; private int text_length, mode, fontSize; private boolean hasMerged; Line(final PdfData pdf_data, final int index) { loadData(pdf_data, index); } private void loadData(final PdfData pdf_data, final int index) { //extract values character_spacing = pdf_data.f_character_spacing[index]; x1 = pdf_data.f_x1[index]; x2 = pdf_data.f_x2[index]; y1 = pdf_data.f_y1[index]; y2 = pdf_data.f_y2[index]; currentColor = pdf_data.colorTag[index]; text_length = pdf_data.text_length[index]; mode = pdf_data.f_writingMode[index]; raw = pdf_data.contents[index]; fontSize = pdf_data.f_end_font_size[index]; spaceWidth = pdf_data.space_width[index]; hasMerged = false; } 
protected float getX1() { return x1; } protected float getY1() { return y1; } protected float getX2() { return x2; } protected float getY2() { return y2; } protected float getCharacterSpacing() { return character_spacing; } protected float getSpaceWidth() { return spaceWidth; } protected String getRawData() { return raw; } protected String getColorTag() { return currentColor; } protected int getWritingMode() { return mode; } protected int getTextLength() { return text_length; } protected int getFontSize() { return fontSize; } protected boolean hasMerged() { return hasMerged; } protected void setX1(final float value) { x1 = value; } protected void setY1(final float value) { y1 = value; } protected void setX2(final float value) { x2 = value; } protected void setY2(final float value) { y2 = value; } protected void setFontSize(final int value) { fontSize = value; } protected void setRawData(final String value) { raw = value; } protected void setTextLength(final int value) { text_length = value; } protected void setMerged(final boolean value) { hasMerged = value; } @Override public int compareTo(final Line o) { switch (mode) { case PdfData.HORIZONTAL_LEFT_TO_RIGHT: case PdfData.HORIZONTAL_RIGHT_TO_LEFT: return (int) (y1 - o.getY1()); case PdfData.VERTICAL_TOP_TO_BOTTOM: case PdfData.VERTICAL_BOTTOM_TO_TOP: return (int) (x1 - o.getX1()); } return 0; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy