com.jaeksoft.searchlib.ocr.HocrDocument Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opensearchserver Show documentation
OpenSearchServer is a powerful, enterprise-class search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTful API, you will be able to quickly and easily integrate advanced full-text search capabilities into your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.
The newest version!
/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2012-2013 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.ocr;

import java.awt.Rectangle;
import java.io.File;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.xml.sax.InputSource;

import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.parser.ParserFieldEnum;
import com.jaeksoft.searchlib.parser.ParserResultItem;
import com.jaeksoft.searchlib.parser.htmlParser.DomHtmlNode;
import com.jaeksoft.searchlib.parser.htmlParser.HtmlNodeAbstract;
import com.jaeksoft.searchlib.util.DomUtils;
import com.jaeksoft.searchlib.util.IOUtils;

public class HocrDocument {

	private final List paragraphList;

	private final Map> boxMap;

	private HocrDocument() {
		paragraphList = new ArrayList(0);
		boxMap = new TreeMap>();
	}

	private final void ocrx_word(HtmlNodeAbstract parentNode,
			StringBuilder currentParagraph) throws SearchLibException {
		if (parentNode == null)
			return;
		String parent_bbox = parentNode.getAttributeText("title").substring(5);
		for (HtmlNodeAbstract xwordNode : parentNode.getNodes("span")) {
			if (!"ocrx_word".equals(xwordNode.getAttributeText("class")))
				continue;
			String word_bbox = xwordNode.getAttributeText("title").substring(5);
			String word = xwordNode.getText();
			if (word == null)
				continue;
			word = word.trim();
			String pword = word;
			if (word.length() == 0)
				continue;
			word = word.toLowerCase();
			List boxList = boxMap.get(word);
			if (boxList == null) {
				boxList = new ArrayList();
				boxMap.put(word, boxList);
			}
			boxList.add(new HocrBox(word_bbox == null ? parent_bbox : word_bbox));
			currentParagraph.append(pword);
			currentParagraph.append(' ');
		}
	}

	public HocrDocument(File ocrFile) throws SearchLibException {
		this();
		FileInputStream fis = null;
		try {

			fis = new FileInputStream(ocrFile);

			InputSource inputSource = new InputSource(fis);
			inputSource.setEncoding("UTF-8");
			DomHtmlNode doc = new DomHtmlNode(DomUtils.readXml(inputSource,
					true));

			for (HtmlNodeAbstract pageNode : doc.getNodes("html", "body",
					"div")) {
				if (!"ocr_page".equals(pageNode.getAttributeText("class")))
					continue;
				for (HtmlNodeAbstract areaNode : pageNode.getNodes("div")) {
					if (!"ocr_carea".equals(areaNode.getAttributeText("class")))
						continue;
					for (HtmlNodeAbstract parNode : areaNode.getNodes("p")) {
						if (!"ocr_par"
								.equals(parNode.getAttributeText("class")))
							continue;
						StringBuilder currentParagraph = new StringBuilder();
						for (HtmlNodeAbstract lineNode : parNode
								.getNodes("span")) {
							if (!"ocr_line".equals(lineNode
									.getAttributeText("class")))
								continue;
							for (HtmlNodeAbstract wordNode : lineNode
									.getNodes("span")) {
								if (!"ocr_word".equals(wordNode
										.getAttributeText("class")))
									continue;
								ocrx_word(wordNode, currentParagraph);
							}
							ocrx_word(lineNode, currentParagraph);
						}
						paragraphList.add(currentParagraph);
					}
				}
			}
		} catch (Exception e) {
			Logging.error("Unable to read ocr file: " + ocrFile == null ? ""
					: ocrFile.getAbsolutePath() + " " + ocrFile.length());
			throw new SearchLibException(e);
		} finally {
			IOUtils.close(fis);
		}
	}

	public HocrDocument(JSONObject jsonObject) throws SearchLibException {
		this();
		for (Object key : jsonObject.keySet()) {
			JSONArray jsonArray = (JSONArray) jsonObject.get(key);
			List hocrBox = new ArrayList(0);
			for (Object obj : jsonArray)
				hocrBox.add(new HocrBox(obj.toString()));
			boxMap.put(key.toString(), hocrBox);
		}
	}

	public void putTextToParserField(ParserResultItem result,
			ParserFieldEnum parserField) {
		for (StringBuilder paragraph : paragraphList)
			result.addField(parserField, paragraph.toString().trim());
	}

	public void putHocrToParserField(ParserResultItem result,
			ParserFieldEnum parserField) {
		result.addField(parserField, getJsonBoxMap().toJSONString());
	}

	@SuppressWarnings("unchecked")
	public JSONObject getJsonBoxMap() {
		JSONObject jsonObject = new JSONObject();
		for (String word : boxMap.keySet()) {
			JSONArray jsonBoxes = new JSONArray();
			for (HocrBox box : boxMap.get(word))
				jsonBoxes.add(box.toString());
			jsonObject.put(word, jsonBoxes);
		}
		return jsonObject;
	}

	public void addBoxes(String keyword, List boxList,
			float xFactor, float yFactor) {
		if (keyword == null)
			return;
		keyword = keyword.toLowerCase().trim();
		if (keyword.length() == 0)
			return;
		List boxes = boxMap.get(keyword);
		if (boxes == null)
			return;
		for (HocrBox box : boxes)
			box.addRectangle(boxList, xFactor, yFactor);
	}

	final public static void main(String[] args) throws SearchLibException {
		HocrDocument hocrDocument = new HocrDocument(new File(
				"/Users/ekeller/Desktop/ossocr1998028053342416847.html"));
		System.out.println(hocrDocument.getJsonBoxMap().size());
	}
}