
gov.nasa.pds.tools.util.DocumentUtil Maven / Gradle / Ivy
// Copyright 2021, by the California Institute of Technology.
// ALL RIGHTS RESERVED. United States Government Sponsorship acknowledged.
// Any commercial use must be negotiated with the Office of Technology Transfer
// at the California Institute of Technology.
//
// This software is subject to U. S. export control laws and regulations
// (22 C.F.R. 120-130 and 15 C.F.R. 730-774). To the extent that the software
// is subject to U.S. export control laws and regulations, the recipient has
// the responsibility to obtain export licenses or other export authority as
// may be required before exporting such information to foreign countries or
// providing access to foreign nationals.
//
package gov.nasa.pds.tools.util;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.parser.Parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import gov.nasa.pds.tools.validate.ProblemType;
/**
* Util class to parse and remove comments from a Document object. It will also keep a mapping of
* document type to ProblemType to allow the retrieval of ProblemType based on document type.
*
*/
public class DocumentUtil {
private static final Logger LOG = LoggerFactory.getLogger(DocumentUtil.class);
// The following two lists are to keep track of a map from document type to
// ProblemType enum.
// They are assigned to an empty list to avoid null pointer exception and will
// be initialized in the initialize() function.
private ArrayList docTypeList = new ArrayList<>();
private ArrayList problemTypeList = new ArrayList<>();
private boolean classInitialized = false; // Indicate whether if the mapping has been initialized
// or not.
public DocumentUtil() {
// Because this class can be instantiated by other classes many times, we only
// want to call the initialize() function
// if we will be using the getProblemType() function below.
}
private void initialize() {
// Create a map to go from a document type to a ProblemType.
// The map consists of two arrays, one to hold the docType and one to hold the
// enumerated ProblemType.
// These keys below (of type String) do not have to be exact. When the key is
// searched for, we will use contains() and ignore case to find the ProblemType
// Example:
// The value "postscript" can be found in "encapsulated postscript" and
// "postscript"
// Empty the lists in case they have content. This is important if somehow the
// function initialize() gets call multiple many times.
this.docTypeList = new ArrayList<>();
this.problemTypeList = new ArrayList<>();
// The add() function should be called in the order specified since together
// they form a mapping mechanism,
// for example 'ENCAPSULATED' before 'POSTSCRIPT' and 'RICH' before 'TEXT'.
this.docTypeList.add("ENCAPSULATED");
this.problemTypeList.add(ProblemType.NON_ENCAPSULATED_POSTSCRIPT_FILE);
this.docTypeList.add("EXCEL");
this.problemTypeList.add(ProblemType.NON_MSEXCEL_FILE);
this.docTypeList.add("GIF");
this.problemTypeList.add(ProblemType.NON_GIF_FILE);
this.docTypeList.add("HTML");
this.problemTypeList.add(ProblemType.NON_HTML_FILE);
this.docTypeList.add("LATEX");
this.problemTypeList.add(ProblemType.NON_LATEX_FILE);
this.docTypeList.add("MPEG");
this.problemTypeList.add(ProblemType.NON_MP4_FILE);
this.docTypeList.add("POSTSCRIPT");
this.problemTypeList.add(ProblemType.NON_POSTSCRIPT_FILE);
this.docTypeList.add("TIFF");
this.problemTypeList.add(ProblemType.NON_TIFF_FILE);
this.docTypeList.add("WORD");
this.problemTypeList.add(ProblemType.NON_MSWORD_FILE);
this.classInitialized = true;
LOG.debug("initialize:this.docTypeList.size {}", this.docTypeList.size());
LOG.debug("initialize:this.problemTypeList.size {}", this.problemTypeList.size());
}
/**
* Returns the enum ProblemType based on the docType.
*
* @param docType The string represent the document type.
*
* @return problemType The matching ProblemType based on the document type. Can be null if not
* matching ProblemType can be found.
*/
public ProblemType getProblemType(String docType) {
ProblemType problemType = null;
if (!this.classInitialized) {
// Only initialize this class once of the two lists' content.
this.initialize();
}
// Iterating through docTypeList and check if docType contains singleDocType.
// Note that everything is changed to lower cases for comparison.
int ii = 0;
for (String singleDocType : this.docTypeList) {
if (docType.toLowerCase().contains(singleDocType.toLowerCase())) {
problemType = this.problemTypeList.get(ii);
// Once we have found a matching value, there's no need to continue looping as
// it will be fetching the wrong ProblemType if we continue.
break;
}
ii++;
}
LOG.debug("getProblemType:docType,problemType {},{}", docType, problemType);
return (problemType);
}
private void removeComments(Node node) {
// Remove comments from the given node.
for (int i = 0; i < node.childNodeSize();) {
Node child = node.childNode(i);
if (child.nodeName().equals("#comment")) {
child.remove();
} else {
removeComments(child);
i++;
}
}
}
/**
* Read the content of the file and returns the content of the file as String.
*
* @param fileUrl The URL of the file.
* @return The content of the file as String.
*/
public String readFile(URL fileUrl) {
BufferedReader reader = null;
// Note: The function FilenameUtils.getPath() doesn't seem to work correctly.
// It returns the path without the leading slash '/':
//
// For this URI
//
// file:/home/qchau/sandbox/validate/src/test/resources/github367/document/
//
// The FilenameUtils.getPath(getTarget().getPath()) returns
//
// home/qchau/sandbox/validate/src/test/resources/github367/document/
//
// which is missing the leading slash.
//
// Using alternative method to get the parent.
String parent = "";
if (FileUtils.toFile(fileUrl).getPath().lastIndexOf("/") < 0) {
LOG.error("The path does not contain a file separator {}", FileUtils.toFile(fileUrl).getPath());
return (null);
}
parent = FileUtils.toFile(fileUrl).getPath().substring(0, FileUtils.toFile(fileUrl).getPath().lastIndexOf("/"));
LOG.debug("readFile:fileUrl,parent,FilenameUtils.getName(fileUrl) {},{},{}", fileUrl, parent,
FilenameUtils.getName(fileUrl.toString()));
// Combine the parent and the file name together so sonatype-lift won't
// complain.
// https://find-sec-bugs.github.io/bugs.htm#PATH_TRAVERSAL_IN
try {
reader = new BufferedReader(
new FileReader(parent + File.separator + FilenameUtils.getName(fileUrl.toString())));
} catch (FileNotFoundException ex) {
LOG.error("readFile: Cannot find file {}", fileUrl);
ex.printStackTrace();
}
String line = null;
StringBuilder stringBuilder = new StringBuilder();
try {
while ((line = reader.readLine()) != null) {
stringBuilder.append(line + "\n");
}
reader.close();
return stringBuilder.toString();
} catch (IOException ex) {
LOG.error("readFile: Cannot read file {}", fileUrl);
ex.printStackTrace();
}
try {
reader.close(); // Close the resource in case of an exception.
} catch (IOException ex) {
LOG.error("readFile: Cannot close file {}", fileUrl);
ex.printStackTrace();
}
return (null);
}
/**
* Returns the content of the file minus the comments as String.
*
* @param fileUrl The URL of the file.
* @return documentContent The content of the file minus the comments as String.
*/
public String getDocumentWithoutComments(URL fileUrl) {
long start = System.currentTimeMillis();
String documentContent = this.readFile(fileUrl); // Read the file as String.
Document doc = Jsoup.parse(documentContent, "", Parser.xmlParser()); // Parse the file as XML.
removeComments(doc); // Remove any comments.
long finish = System.currentTimeMillis();
long timeElapsed = finish - start;
LOG.debug("getDocumentWithoutComments: timeElapsed (millisecs) {}", timeElapsed);
return (doc.html()); // It doesn't matter that the file is XML. This function html() returns the
// file
// as String.
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy