// de.citec.scie.pdf.structure.Document Maven / Gradle / Ivy

/*
 * SCIE -- Spinal Cord Injury Information Extraction
 * Copyright (C) 2013, 2014
 * Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.citec.scie.pdf.structure;

import java.util.ArrayList;
import java.util.Objects;

/**
 * This represents a parsed document which is defined as a sequence of pages.
 *
 * @author Benjamin Paassen - bpaassen(at)techfak.uni-bielefeld.de
 */
public class Document extends LineSegment {

	/**
	 * Text placed between two consecutive elements in the plain text
	 * representation.
	 */
	private static final String PAGE_SEPARATOR = "\n\n\n";

	/**
	 * The elements this document consists of. The conversion methods below
	 * visit them in list order.
	 *
	 * NOTE(review): this is declared as a raw ArrayList. The methods below
	 * invoke indexedToString and toXML on the elements, so the element type
	 * is presumably the page class of this package -- confirm and
	 * parameterize the declaration accordingly.
	 */
	public final ArrayList content = new ArrayList<>();

	/**
	 * Creates a document with an empty content list.
	 */
	public Document() {
	}

	/**
	 * Derives the hash code from the content list only.
	 *
	 * @return the hash code of this Document.
	 */
	@Override
	public int hashCode() {
		return 83 * 7 + Objects.hashCode(this.content);
	}

	/**
	 * Two documents are equal if they are of exactly the same class and
	 * their content lists are equal.
	 *
	 * @param obj the object to compare with, may be null.
	 * @return true if obj is a Document with equal content.
	 */
	@Override
	public boolean equals(Object obj) {
		if (obj == null || getClass() != obj.getClass()) {
			return false;
		}
		final Document that = (Document) obj;
		return Objects.equals(this.content, that.content);
	}

	/**
	 * Builds the plain text of this Document by concatenating the toString
	 * results of all content elements, separated by three line breaks.
	 *
	 * @return the plain text content of this Document.
	 */
	@Override
	public String toString() {
		final StringBuilder text = new StringBuilder();
		for (int pageNo = 0; pageNo < content.size(); pageNo++) {
			// the separator goes between elements only, never after the last
			if (pageNo > 0) {
				text.append(PAGE_SEPARATOR);
			}
			text.append(content.get(pageNo).toString());
		}
		return text.toString();
	}

	/**
	 * Builds the same text as toString, but additionally records text
	 * offsets: the start offset is passed to setBegin and the offset after
	 * the last character to setEnd. Each element receives the offset at
	 * which its own text starts via its indexedToString method.
	 *
	 * @param currentIdx the offset in the overall plain text at which the
	 * text of this Document starts. Use 0 when calling this on a complete
	 * document.
	 * @return the plain text representation of this Document, same as for
	 * the toString method.
	 */
	public String indexedToString(int currentIdx) {
		setBegin(currentIdx);
		final StringBuilder text = new StringBuilder();
		for (int pageNo = 0; pageNo < content.size(); pageNo++) {
			if (pageNo > 0) {
				// the separator occupies text positions as well
				text.append(PAGE_SEPARATOR);
				currentIdx += PAGE_SEPARATOR.length();
			}
			final String pageText
					= content.get(pageNo).indexedToString(currentIdx);
			text.append(pageText);
			currentIdx += pageText.length();
		}
		setEnd(currentIdx);
		return text.toString();
	}

	/**
	 * Builds a XML representation of this Document by concatenating the
	 * toXML results of all content elements, separated by single line
	 * breaks and surrounded by one leading and one trailing line break.
	 *
	 * NOTE(review): only line breaks are emitted around the elements. If an
	 * enclosing root element is expected in the output, it is not produced
	 * here -- confirm.
	 *
	 * @return a string containing a XML representation of this Document.
	 */
	public String toXML() {
		final StringBuilder xml = new StringBuilder("\n");
		for (int pageNo = 0; pageNo < content.size(); pageNo++) {
			if (pageNo > 0) {
				xml.append("\n");
			}
			xml.append(content.get(pageNo).toXML());
		}
		return xml.append("\n").toString();
	}
}