All downloads are free. The search and download features use the official Maven repository.

com.jaeksoft.searchlib.util.pdfbox.PDFBoxHighlighter Maven / Gradle / Ivy

Go to download

OpenSearchServer is a powerful, enterprise-class search engine. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTful API, you can quickly and easily integrate advanced full-text search capabilities into your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.

The newest version!
/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2014 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/
package com.jaeksoft.searchlib.util.pdfbox;

import java.awt.Dimension;
import java.awt.Rectangle;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;

/**
 * A {@link PDFTextStripper} that, in addition to the regular text extraction,
 * collects the rectangles of the words matching one of the given keywords.
 * <p>
 * Rectangles are scaled from the page crop box to the dimension of a target
 * image, so they can be drawn directly on a rendering of the page that has
 * that dimension. The matching is case-insensitive and is done on whole words
 * (runs of letters and digits).
 * </p>
 * <p>
 * The instance keeps per-page state (the scale factors), and appends its
 * results to the collection supplied by the caller.
 * </p>
 */
public class PDFBoxHighlighter extends PDFTextStripper {

	/** Words to look for; compared case-insensitively. May be null. */
	private final String[] keywords;

	/** Caller-owned collection receiving the highlight rectangles. */
	private final Collection<Rectangle> boxes;

	/** Size of the image the rectangles are scaled to. */
	private final Dimension imageDimension;

	/** Horizontal scale: image width / crop box width of the current page. */
	private float xFactor;

	/** Vertical scale: image height / crop box height of the current page. */
	private float yFactor;

	/**
	 * @param keywords
	 *            the words to highlight
	 * @param boxes
	 *            the collection the matching rectangles are added to
	 * @param imageDimension
	 *            the dimension of the image the rectangles must be scaled to
	 * @throws IOException
	 *             if the parent text stripper cannot be initialized
	 */
	public PDFBoxHighlighter(String[] keywords, Collection<Rectangle> boxes,
			Dimension imageDimension) throws IOException {
		this.keywords = keywords;
		this.boxes = boxes;
		this.imageDimension = imageDimension;
	}

	/**
	 * Computes the scale factors between the crop box of the page and the
	 * target image before the text of the page is processed.
	 */
	@Override
	protected void startPage(PDPage page) throws IOException {
		super.startPage(page);
		PDRectangle rect = page.findCropBox();
		// int / float: the division is done in floating point.
		xFactor = imageDimension.width / rect.getWidth();
		yFactor = imageDimension.height / rect.getHeight();
	}

	/**
	 * Splits the given text positions into words and records the rectangles
	 * of the words matching a keyword.
	 * <p>
	 * A word is a run of single-character glyphs that are letters or digits.
	 * A glyph carrying more than one character is checked against the
	 * keywords on its own and also ends the word in progress.
	 * </p>
	 */
	@Override
	protected void writeString(String text, List<TextPosition> textPositions)
			throws IOException {
		super.writeString(text, textPositions);
		if (textPositions == null)
			return;
		StringBuilder term = new StringBuilder();
		List<Rectangle> rects = new ArrayList<Rectangle>();
		for (TextPosition tp : textPositions) {
			String str = tp.getCharacter();
			// A glyph without text cannot contribute to a word; skipping it
			// also protects the charAt(0) below.
			if (str == null || str.length() == 0)
				continue;
			if (str.length() > 1) {
				// Flush the word in progress, then test the multi-character
				// glyph as a term of its own.
				handleTerm(term, rects);
				term.setLength(0);
				rects.clear();
				addRect(tp, rects);
				handleTerm(str, rects);
				// Do not let this rectangle leak into the next word.
				rects.clear();
				continue;
			}
			char c = str.charAt(0);
			if (!Character.isLetterOrDigit(c)) {
				// Word separator: flush the word in progress.
				handleTerm(term, rects);
				term.setLength(0);
				rects.clear();
				continue;
			}
			term.append(c);
			addRect(tp, rects);
		}
		// Flush the trailing word.
		handleTerm(term, rects);
	}

	/**
	 * Adds to the list the rectangle of the given text position, expressed in
	 * image coordinates and enlarged by 20% of the glyph size on each side.
	 * 
	 * @param tp
	 *            the text position of the glyph
	 * @param rects
	 *            the list receiving the new rectangle
	 */
	private void addRect(TextPosition tp, List<Rectangle> rects) {
		Rectangle rect = new Rectangle();
		float xdelta = tp.getWidthDirAdj() * .2F;
		float ydelta = tp.getHeightDir() * .2F;
		rect.x = (int) ((tp.getXDirAdj() - xdelta) * xFactor);
		// NOTE(review): presumably getYDirAdj() is the baseline, hence the
		// glyph height is subtracted to reach the top edge - confirm.
		rect.y = (int) ((tp.getYDirAdj() - (tp.getHeightDir() + ydelta)) * yFactor);
		rect.width = (int) ((tp.getWidthDirAdj() + xdelta * 2) * xFactor);
		rect.height = (int) ((tp.getHeightDir() + ydelta * 2) * yFactor);
		rects.add(rect);
	}

	/**
	 * Checks the term against the keywords and, on the first match, adds its
	 * rectangles to the boxes.
	 * <p>
	 * When every glyph rectangle has the same y coordinate, one rectangle
	 * covering all of them is added. Otherwise each glyph rectangle is added
	 * separately.
	 * </p>
	 * 
	 * @param term
	 *            the word to check; ignored when null or empty
	 * @param rects
	 *            the rectangles of the glyphs of the word; ignored when null
	 *            or empty
	 */
	private void handleTerm(CharSequence term, List<Rectangle> rects) {
		if (term == null || term.length() == 0)
			return;
		if (rects == null || rects.isEmpty())
			return;
		if (keywords == null)
			return;
		String str = term.toString();
		for (String keyword : keywords) {
			// Called on str so that a null keyword is simply a non-match.
			if (!str.equalsIgnoreCase(keyword))
				continue;
			Rectangle unionRect = new Rectangle(rects.get(0));
			boolean sameLine = true;
			for (Rectangle rect : rects) {
				if (rect.y != unionRect.y) {
					sameLine = false;
					break;
				}
				unionRect = unionRect.union(rect);
			}
			if (sameLine)
				boxes.add(unionRect);
			else
				boxes.addAll(rects);
			return;
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy