// de.citec.scie.pdf.structure.Document
/*
* SCIE -- Spinal Cord Injury Information Extraction
* Copyright (C) 2013, 2014
* Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package de.citec.scie.pdf.structure;
import java.util.ArrayList;
import java.util.Objects;
/**
 * This represents a parsed document which is defined as a sequence of pages.
 * <p>
 * The pages are held in the public {@link #content} list. The plain text and
 * XML representations of a document are assembled from the representations of
 * its pages, in list order.
 *
 * @author Benjamin Paassen - bpaassen(at)techfak.uni-bielefeld.de
 */
public class Document extends LineSegment {

    /** Separator placed between the plain text of two consecutive pages. */
    private static final String PAGE_SEPARATOR = "\n\n\n";

    /**
     * The pages of this document, in order.
     */
    // NOTE(review): the element type Page is assumed from the "sequence of
    // pages" contract of this class; confirm it names the page class of this
    // package. A raw ArrayList cannot be used here because the methods below
    // call indexedToString(int) and toXML() on the elements.
    public final ArrayList<Page> content = new ArrayList<>();

    /**
     * Creates an empty document without any pages.
     */
    public Document() {
    }

    /**
     * Computes the hash code of this document from its page list only.
     *
     * @return a hash code consistent with {@link #equals(Object)}.
     */
    @Override
    public int hashCode() {
        int hash = 7;
        hash = 83 * hash + Objects.hashCode(this.content);
        return hash;
    }

    /**
     * Compares this document with another object. Two documents are equal if
     * they are of exactly the same class and their page lists are equal.
     *
     * @param obj the object to compare with, may be null.
     * @return true if obj is a Document of the same class with an equal page
     * list, false otherwise.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        final Document other = (Document) obj;
        return Objects.equals(this.content, other.content);
    }

    /**
     * Converts this object to a string by going through the pages of this
     * document and calling their respective toString methods. The texts of
     * consecutive pages are separated by three line breaks; nothing is
     * appended after the last page.
     *
     * @return a string showing the plain text content of this Document.
     */
    @Override
    public String toString() {
        final StringBuilder output = new StringBuilder();
        final int lastIdx = content.size() - 1;
        for (int i = 0; i <= lastIdx; i++) {
            output.append(content.get(i).toString());
            if (i < lastIdx) {
                output.append(PAGE_SEPARATOR);
            }
        }
        return output.toString();
    }

    /**
     * Does the same as toString but also records the begin and end index of
     * the text representation: setBegin is called with the index at which the
     * text of this document starts and setEnd with the index directly after
     * its last character. Each page is asked for its text via its own
     * indexedToString method, using the index at which that page starts.
     *
     * @param currentIdx the current index in the plain text representation.
     * If you are calling this as a user you should insert 0 here.
     * @return the plain text representation of this Document, same as for the
     * toString method.
     */
    public String indexedToString(int currentIdx) {
        setBegin(currentIdx);
        final StringBuilder output = new StringBuilder();
        final int lastIdx = content.size() - 1;
        for (int i = 0; i <= lastIdx; i++) {
            final String pageStr = content.get(i).indexedToString(currentIdx);
            output.append(pageStr);
            currentIdx += pageStr.length();
            if (i < lastIdx) {
                output.append(PAGE_SEPARATOR);
                // Keep the running index in step with the separator that was
                // actually appended instead of a hand-maintained constant.
                currentIdx += PAGE_SEPARATOR.length();
            }
        }
        setEnd(currentIdx);
        return output.toString();
    }

    /**
     * Returns a XML representation of this document by going through the
     * pages of this document and calling their respective toXML methods. The
     * XML of consecutive pages is separated by a single line break.
     *
     * @return a string containing a XML representation of this Document.
     */
    public String toXML() {
        final StringBuilder output = new StringBuilder();
        // NOTE(review): the leading "\n" and the trailing "\n " literals carry
        // no element markup, so the pages are not wrapped in a root element
        // here. Confirm whether wrapper tags were intended in these literals.
        output.append("\n");
        final int lastIdx = content.size() - 1;
        for (int i = 0; i <= lastIdx; i++) {
            output.append(content.get(i).toXML());
            if (i < lastIdx) {
                output.append("\n");
            }
        }
        output.append("\n ");
        return output.toString();
    }
}