org.pageseeder.ox.diffx.step.DiffText Maven / Gradle / Ivy
/*
* Copyright 2021 Allette Systems (Australia)
* http://www.allette.com.au
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.pageseeder.ox.diffx.step;
import org.apache.commons.io.FileUtils;
import org.pageseeder.diffx.algorithm.DiffXAlgorithm;
import org.pageseeder.diffx.config.DiffXConfig;
import org.pageseeder.diffx.config.WhiteSpaceProcessing;
import org.pageseeder.diffx.event.DiffXEvent;
import org.pageseeder.diffx.event.TextEvent;
import org.pageseeder.diffx.format.DiffXFormatter;
import org.pageseeder.diffx.load.text.TextTokenizer;
import org.pageseeder.diffx.load.text.TokenizerByWord;
import org.pageseeder.diffx.sequence.EventSequence;
import org.pageseeder.ox.OXErrors;
import org.pageseeder.ox.api.Result;
import org.pageseeder.ox.api.Step;
import org.pageseeder.ox.api.StepInfo;
import org.pageseeder.ox.core.Model;
import org.pageseeder.ox.core.PackageData;
import org.pageseeder.ox.core.ResultStatus;
import org.pageseeder.ox.diffx.tool.TidyCommand;
import org.pageseeder.ox.diffx.util.DiffXBasic;
import org.pageseeder.ox.step.SimplifyDOCX;
import org.pageseeder.ox.tool.InvalidResult;
import org.pageseeder.ox.tool.ResultBase;
import org.pageseeder.xmlwriter.XMLWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.tidy.Tidy;
import javax.xml.transform.Templates;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import java.io.*;
import java.util.List;
/**
* A Text Comparison processing step
* Step Parameters
*
* - source It could be DOCX, HTML or PSML and will be compared with the target
* - target It could be DOCX, HTML or PSML and will be compared by the source. It the target is not send
* It gets the input.
* - xsl-psml the specified transformation file for manipulated the psml before diff it. (Optional)
* - xsl-html the specified transformation file for manipulated the html before diff it. (Optional)
* - xsl-docx the specified transformation file for manipulated the docx before diff it. (Optional)
*
*
*
* How it works
* In order to identify what kind of file it is. It will use the extension.
* However for docx, it needs this file simplified. Then before call this class, this step must be called {@link SimplifyDOCX}.
* The {@link SimplifyDOCX} will generate the folder simplified, then this folder should be sent instead of the original docx.
*
* @author Carlos Cabral
* @version 12 April 2015
*/
public class DiffText implements Step {
/** The Constant LOGGER. */
private static final Logger LOGGER = LoggerFactory.getLogger(DiffText.class);
/**
* We don't try to compare more that 24K events.
*/
private static final int MAX_EVENTS = 24000;
/* (non-Javadoc)
* @see org.pageseeder.ox.api.Step#process(org.pageseeder.ox.core.Model, org.pageseeder.ox.core.PackageData, org.pageseeder.ox.api.StepInfo)
*/
@Override
public Result process(Model model, PackageData data, StepInfo info) {
LOGGER.debug("Start Diff Text.");
String sourcePath = info.getParameter("source");
String targetPath = info.getParameter("target");
if (isBlank(targetPath)) {
targetPath = info.input();
}
File source = data.getFile(sourcePath);
File target = data.getFile(targetPath);
// Check the source first
if (!valid(source)) return new InvalidResult(model, data).error(new IllegalArgumentException("The source is invalid"));
// Then validate the target
if (!valid(target)) return new InvalidResult(model, data).error(new IllegalArgumentException("The target is invalid"));
//Getting the content(text)
CheckTextResult result = new CheckTextResult(model, data, sourcePath, targetPath);
try {
String charset = data.getProperty("charset", "utf-8");
String sourceText = toText(model, info, source, charset);
String targetText = toText(model, info, target, charset);
result.sourceText = sourceText;
result.targetText = targetText;
processDiff(result);
} catch (IOException | TransformerException ex) {
LOGGER.error("DIFF Text error: " + ex.getMessage());
result.setError(ex);
}
return result;
}
/**
* Valid.
*
* @param file the file
* @return true, if successful
*/
private static boolean valid(File file) {
if (file == null || !file.exists()) {
LOGGER.warn("Cannot find file {} exist {} .", file);
return false;
}
return true;
}
/**
* Checks if is blank.
*
* @param value the value
* @return true, if is blank
*/
private static boolean isBlank(String value) {
return value == null || value.trim().isEmpty();
}
/**
* Gets the template.
*
* @param model the model
* @param info the info
* @param parameterName the parameter name
* @param defaultFileName the default file name
* @return the template
*/
private Templates getTemplate(Model model, StepInfo info, String parameterName, String defaultFileName) {
String value = info.getParameter(parameterName);
Templates template = null;
try {
if (!isBlank(value)) {
template = model.getTemplates(value);
} else {
template = model.getTemplates(defaultFileName);
}
} catch (IOException | TransformerConfigurationException ex) {
LOGGER.error("Cannot find the template {} defined in {} parameter. ", value, parameterName, ex);
}
return template;
}
/**
* Returns the content as plain text.
*
* The docx must be sent simplified (The Simplified Folder).
*
* @param model the model
* @param info the info
* @param file the file pointing to a HTML, DOCX (Simplified Folder) or PSML.
* @param charset the charset
* @return The file as plain text.
* @throws IOException Should an error occur while reading the file.
* @throws TransformerException Should an error occur while tranforming the content.
*/
private String toText(Model model, StepInfo info, File file, String charset) throws IOException, TransformerException {
String text = null;
if (file.getName().toLowerCase().endsWith("simplified")) {
text = toDOCXText(model, info, file);
} else if (file.getName().toLowerCase().endsWith("html")) {
text = toHTMLText(model, info, file, charset);
} else if (file.getName().toLowerCase().endsWith("psml")) {
text = toPSMLText(model, info, file, charset);
}
return text;
}
/**
* Returns the PSML as plain text.
*
* @param model the model
* @param info the info
* @param psml the file pointing to psml document.
* @param charset the charset
* @return The file as plain text.
* @throws IOException Should an error occur while reading the file.
* @throws TransformerException Should an error occur while tranforming the content.
*/
private String toPSMLText(Model model, StepInfo info, File psml, String charset) throws IOException, TransformerException {
LOGGER.debug("Getting text for PSML.");
String source = FileUtils.readFileToString(psml, charset);
// Convert to plain text
StringWriter text = new StringWriter();
Templates templates = getTemplate(model, info, "xsl-psml", "psml-text.xsl");
Transformer transformer = templates.newTransformer();
transformer.transform(new StreamSource(new StringReader(source)), new StreamResult(text));
return text.toString();
}
/**
* Returns the HTML as plain text.
*
* @param model the model
* @param info the info
* @param html the file pointing to html document.
* @param charset the charset
* @return The file as plain text.
* @throws IOException Should an error occur while reading the file.
* @throws TransformerException Should an error occur while tranforming the content.
*/
private String toHTMLText(Model model, StepInfo info, File html, String charset) throws IOException, TransformerException {
LOGGER.debug("Getting text for HTML.");
// We may need to run tidy first
Tidy tidy = TidyCommand.newTidy(model);
// tidy.setOnlyErrors(true);
// tidy.setShowErrors(0);
StringWriter buffer = new StringWriter();
String source = FileUtils.readFileToString(html, charset);
source = source.replaceAll("<\\?xml(.*?)\\>", "");
tidy.parse(new StringReader(source), buffer);
String xhtml = buffer.toString();
// We must remove the XHTML DOCTYPE declaration since W3 has shutdown its servers for XHTML DTDs
xhtml = xhtml.replaceAll("<\\!DOCTYPE(.*?)\\>\n?", "");
// Convert to plain text
StringWriter text = new StringWriter();
Templates templates = getTemplate(model, info, "xsl-html", "html-text.xsl");
Transformer transformer = templates.newTransformer();
transformer.transform(new StreamSource(new StringReader(xhtml)), new StreamResult(text));
return text.toString();
}
/**
* Returns the main document part as plain text.
*
* @param model the model
* @param info the info
* @param simplified the simplified
* @return The file as plain text.
* @throws IOException Should an error occur while reading the file.
* @throws TransformerException Should an error occur while tranforming the content.
*/
private String toDOCXText(Model model, StepInfo info, File simplified) throws IOException, TransformerException {
LOGGER.debug("Getting text for DOCX.");
//The word file that has the content.
File documentXML = new File(simplified, "/word/document.xml");
//Check if the content file exist
if (documentXML == null || !documentXML.exists()) {
LOGGER.error("DOCX - document.xml was not found: " + simplified.getAbsolutePath() + "/word/document.xml");
throw new FileNotFoundException("DOCX - document.xml was not found.");
}
//Extract the text.
StringWriter text = new StringWriter();
Templates templates = getTemplate(model, info, "xsl-docx", "docx-text.xsl");
Transformer transformer = templates.newTransformer();
transformer.transform(new StreamSource(documentXML), new StreamResult(text));
return text.toString();
}
/**
* Check the differences between result.sourceText and result.targetText and
* then update the result.
*
* @param result the result
* @return the check text result
* @throws IllegalStateException the illegal state exception
* @throws IOException Signals that an I/O exception has occurred.
*/
private void processDiff(CheckTextResult result) throws IllegalStateException, IOException{
//Start the comparison logic
TextTokenizer tokenizer = new TokenizerByWord(WhiteSpaceProcessing.IGNORE);
List htmlEvents = tokenizer.tokenize(result.sourceText);
List docxEvents = tokenizer.tokenize(result.targetText);
EventSequence htmlSequence = toSequence(htmlEvents);
EventSequence docxSequence = toSequence(docxEvents);
DiffXAlgorithm algorithm = new DiffXBasic(htmlSequence, docxSequence);
StringWriter diff = new StringWriter();
TextDiffxFormatter formatter = new TextDiffxFormatter(diff);
algorithm.process(formatter);
formatter.checkClose();
result.diffXML = diff.toString().replaceAll("<\\?xml(.*?)\\>", "");
result.setStatus(formatter.hasDiff() ? ResultStatus.ERROR : ResultStatus.OK);
}
/**
* To sequence.
*
* @param events the events
* @return the event sequence
*/
private EventSequence toSequence(List events) {
int to = Math.min(events.size(), MAX_EVENTS);
EventSequence sequence = new EventSequence(events.size());
for (int i = 0; i < to; i++) {
sequence.addEvent(events.get(i));
}
return sequence;
}
/* **********************************************************************************************
* INNER CLASSES
* **********************************************************************************************/
/**
* The Class CheckTextResult.
*
* @author Christophe Lauret
* @version 28 October 2013
*/
private final class CheckTextResult extends ResultBase implements Result {
/** The source text. */
private String sourceText = "";
/** The target text. */
private String targetText = "";
/** The diff xml. */
private String diffXML = "";
/**
* The path to the source file within the package.
*/
private final String _sourcePath;
/**
* The path to the target(main) document file within the package.
*/
private final String _targetPath;
/**
* Instantiates a new check text result.
*
* @param model the model
* @param data the data
* @param sourcePath the source path
* @param targetPath the target path
*/
public CheckTextResult(Model model, PackageData data, String sourcePath, String targetPath) {
super(model, data);
this._targetPath = targetPath;
this._sourcePath = sourcePath;
}
/* (non-Javadoc)
* @see com.topologi.diffx.xml.XMLWritable#toXML(com.topologi.diffx.xml.XMLWriter)
*/
@Override
public void toXML(XMLWriter xml) throws IOException {
xml.openElement("result", true);
xml.attribute("name", "diff-text");
xml.attribute("id", data().id());
xml.attribute("model", model().name());
xml.attribute("status", status().toString().toLowerCase());
xml.attribute("time", Long.toString(time()));
xml.attribute("downloadable", String.valueOf(isDownloadable()));
// Source document
xml.openElement("source");
xml.attribute("path", this._sourcePath);
xml.writeCDATA(this.sourceText);
xml.closeElement();
// Target document
xml.openElement("target");
xml.attribute("path", this._targetPath);
xml.writeCDATA(this.targetText);
xml.closeElement();
// Settings specified
xml.openElement("diff");
xml.writeXML(this.diffXML);
xml.closeElement();
// Print the details of any error
if (error() != null) {
OXErrors.toXML(error(), xml, true);
}
xml.closeElement();
}
/* (non-Javadoc)
* @see org.pageseeder.ox.tool.ResultBase#isDownloadable()
*/
@Override
public boolean isDownloadable() {
return false;
}
}
/**
* The Class TextDiffxFormatter.
*/
public static class TextDiffxFormatter implements DiffXFormatter {
/** Diff output. */
final Writer w;
/** The state of the previous event (-1 = delete, 0 = format, 1 = insert). */
private int last = 0;
/**
* State variable set to true
if a difference is detected.
*/
private boolean hasDiff = false;
/**
* Instantiates a new text diffx formatter.
*
* @param w the w
*/
public TextDiffxFormatter(Writer w) {
this.w = w;
}
/* (non-Javadoc)
* @see com.topologi.diffx.format.DiffXFormatter#setConfig(com.topologi.diffx.config.DiffXConfig)
*/
@Override
public void setConfig(DiffXConfig config) {}
/* (non-Javadoc)
* @see com.topologi.diffx.format.DiffXFormatter#format(com.topologi.diffx.event.DiffXEvent)
*/
@Override
public void format(DiffXEvent e) throws IOException, IllegalStateException {
checkClose();
this.w.write(e.toXML());
this.w.write(' ');
this.last = 0;
}
/* (non-Javadoc)
* @see com.topologi.diffx.format.DiffXFormatter#delete(com.topologi.diffx.event.DiffXEvent)
*/
@Override
public void delete(DiffXEvent e) throws IOException, IllegalStateException {
if (this.last > 0) {
this.w.write("");
}
if (this.last != -1) {
this.w.write("");
}
this.w.write(e.toXML());
this.w.write(' ');
this.last = -1;
this.hasDiff = true;
}
/* (non-Javadoc)
* @see com.topologi.diffx.format.DiffXFormatter#insert(com.topologi.diffx.event.DiffXEvent)
*/
@Override
public void insert(DiffXEvent e) throws IOException, IllegalStateException {
if (this.last < 0) {
this.w.write("");
}
if (this.last != 1) {
this.w.write("");
}
this.w.write(e.toXML());
this.w.write(' ');
this.last = 1;
this.hasDiff = true;
}
/**
* Check close.
*
* @throws IOException Signals that an I/O exception has occurred.
* @throws IllegalStateException the illegal state exception
*/
public void checkClose() throws IOException, IllegalStateException {
if (this.last > 0) {
this.w.write("");
}
if (this.last < 0) {
this.w.write("");
}
}
/**
* Checks for diff.
*
* @return the hasDiff
*/
public boolean hasDiff() {
return this.hasDiff;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy