
// com.jaeksoft.searchlib.ocr.HocrDocument Maven / Gradle / Ivy
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2012-2013 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.ocr;
import java.awt.Rectangle;
import java.io.File;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.xml.sax.InputSource;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.parser.ParserFieldEnum;
import com.jaeksoft.searchlib.parser.ParserResultItem;
import com.jaeksoft.searchlib.parser.htmlParser.DomHtmlNode;
import com.jaeksoft.searchlib.parser.htmlParser.HtmlNodeAbstract;
import com.jaeksoft.searchlib.util.DomUtils;
import com.jaeksoft.searchlib.util.IOUtils;
public class HocrDocument {
private final List paragraphList;
private final Map> boxMap;
private HocrDocument() {
paragraphList = new ArrayList(0);
boxMap = new TreeMap>();
}
private final void ocrx_word(HtmlNodeAbstract> parentNode,
StringBuilder currentParagraph) throws SearchLibException {
if (parentNode == null)
return;
String parent_bbox = parentNode.getAttributeText("title").substring(5);
for (HtmlNodeAbstract> xwordNode : parentNode.getNodes("span")) {
if (!"ocrx_word".equals(xwordNode.getAttributeText("class")))
continue;
String word_bbox = xwordNode.getAttributeText("title").substring(5);
String word = xwordNode.getText();
if (word == null)
continue;
word = word.trim();
String pword = word;
if (word.length() == 0)
continue;
word = word.toLowerCase();
List boxList = boxMap.get(word);
if (boxList == null) {
boxList = new ArrayList();
boxMap.put(word, boxList);
}
boxList.add(new HocrBox(word_bbox == null ? parent_bbox : word_bbox));
currentParagraph.append(pword);
currentParagraph.append(' ');
}
}
public HocrDocument(File ocrFile) throws SearchLibException {
this();
FileInputStream fis = null;
try {
fis = new FileInputStream(ocrFile);
InputSource inputSource = new InputSource(fis);
inputSource.setEncoding("UTF-8");
DomHtmlNode doc = new DomHtmlNode(DomUtils.readXml(inputSource,
true));
for (HtmlNodeAbstract> pageNode : doc.getNodes("html", "body",
"div")) {
if (!"ocr_page".equals(pageNode.getAttributeText("class")))
continue;
for (HtmlNodeAbstract> areaNode : pageNode.getNodes("div")) {
if (!"ocr_carea".equals(areaNode.getAttributeText("class")))
continue;
for (HtmlNodeAbstract> parNode : areaNode.getNodes("p")) {
if (!"ocr_par"
.equals(parNode.getAttributeText("class")))
continue;
StringBuilder currentParagraph = new StringBuilder();
for (HtmlNodeAbstract> lineNode : parNode
.getNodes("span")) {
if (!"ocr_line".equals(lineNode
.getAttributeText("class")))
continue;
for (HtmlNodeAbstract> wordNode : lineNode
.getNodes("span")) {
if (!"ocr_word".equals(wordNode
.getAttributeText("class")))
continue;
ocrx_word(wordNode, currentParagraph);
}
ocrx_word(lineNode, currentParagraph);
}
paragraphList.add(currentParagraph);
}
}
}
} catch (Exception e) {
Logging.error("Unable to read ocr file: " + ocrFile == null ? ""
: ocrFile.getAbsolutePath() + " " + ocrFile.length());
throw new SearchLibException(e);
} finally {
IOUtils.close(fis);
}
}
public HocrDocument(JSONObject jsonObject) throws SearchLibException {
this();
for (Object key : jsonObject.keySet()) {
JSONArray jsonArray = (JSONArray) jsonObject.get(key);
List hocrBox = new ArrayList(0);
for (Object obj : jsonArray)
hocrBox.add(new HocrBox(obj.toString()));
boxMap.put(key.toString(), hocrBox);
}
}
public void putTextToParserField(ParserResultItem result,
ParserFieldEnum parserField) {
for (StringBuilder paragraph : paragraphList)
result.addField(parserField, paragraph.toString().trim());
}
public void putHocrToParserField(ParserResultItem result,
ParserFieldEnum parserField) {
result.addField(parserField, getJsonBoxMap().toJSONString());
}
@SuppressWarnings("unchecked")
public JSONObject getJsonBoxMap() {
JSONObject jsonObject = new JSONObject();
for (String word : boxMap.keySet()) {
JSONArray jsonBoxes = new JSONArray();
for (HocrBox box : boxMap.get(word))
jsonBoxes.add(box.toString());
jsonObject.put(word, jsonBoxes);
}
return jsonObject;
}
public void addBoxes(String keyword, List boxList,
float xFactor, float yFactor) {
if (keyword == null)
return;
keyword = keyword.toLowerCase().trim();
if (keyword.length() == 0)
return;
List boxes = boxMap.get(keyword);
if (boxes == null)
return;
for (HocrBox box : boxes)
box.addRectangle(boxList, xFactor, yFactor);
}
final public static void main(String[] args) throws SearchLibException {
HocrDocument hocrDocument = new HocrDocument(new File(
"/Users/ekeller/Desktop/ossocr1998028053342416847.html"));
System.out.println(hocrDocument.getJsonBoxMap().size());
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy