
io.vertigo.quarto.plugins.publisher.docx.DOCXCleanerProcessor Maven / Gradle / Ivy
/**
* vertigo - simple java starter
*
* Copyright (C) 2013, KleeGroup, [email protected] (http://www.kleegroup.com)
* KleeGroup, Centre d'affaire la Boursidiere - BP 159 - 92357 Le Plessis Robinson Cedex - France
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.vertigo.quarto.plugins.publisher.docx;
import io.vertigo.lang.WrappedException;
import io.vertigo.quarto.publisher.impl.merger.processor.MergerProcessor;
import io.vertigo.quarto.publisher.model.PublisherData;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
* Cleaner de xml de fichier DOCX.
* Ce processor effectue plusieurs opérations de rectification du XML d'un fichier DOCX.
* - 1. Nettoyage du XML en fermant les balises
* - 2. Suppression des balises de script
*
* @author adufranne
*/
final class DOCXCleanerProcessor implements MergerProcessor {
private static final String CARRIAGE_RETURN = "\\n";
private static final String EMPTY_STRING = "\\s+";
/** {@inheritDoc} */
@Override
public String execute(final String xmlInput, final PublisherData publisherData) {
final Document xmlDoc = DOCXUtil.loadDOM(xmlInput);
final XPath xpath = DOCXUtil.loadXPath();
try {
removeEmptyTags(xmlDoc, xpath);
handleCarriageReturn(xmlDoc, xpath);
return DOCXUtil.renderXML(xmlDoc);
} catch (final XPathExpressionException e) {
throw new WrappedException(e);
}
}
/**
* Transforme les \n en plusieurs lignes dans le docx.
* NB: visibilité package nécessaire pour les tests!.
* ex :
*
*
* (style)
* < /w:pPr>
* in
* test\ntest\ntest
* out
* < /w:p>
* Doit se transformer en 3 balises , la première contenant le in et un test, la deuxième un test, la troisième
* un test et le out. La balise de style
* doit être conservée dans les 3 nouveaux paragraphes.
* @param xmlDoc Document source
* @param xpath Moteur Xpath
* @throws XPathExpressionException Erreur Xpath
*/
static void handleCarriageReturn(final Document xmlDoc, final XPath xpath) throws XPathExpressionException {
final NodeList nodeList = (NodeList) xpath.evaluate(DOCXUtil.XPATH_TAG_NODES, xmlDoc, XPathConstants.NODESET);
Node node;
String[] lines;
for (int i = 0; i < nodeList.getLength(); i++) { // pour chaque résultat
node = nodeList.item(i);
lines = getNodeTextContent(node);
if (lines.length <= 1) {
continue;
}
doHandleCarriageReturn(lines, node);
}
}
private static String[] getNodeTextContent(final Node node) {
final String[] lines;
final String textContent = node.getLastChild().getTextContent();
if (textContent == null || textContent.isEmpty()) {
return new String[0];
}
lines = textContent.split(CARRIAGE_RETURN);
return lines;
}
private static void doHandleCarriageReturn(final String[] lines, final Node node) {
Node paragraphClone;
Node rowClone;
String line;
Node styleNodeClone;
Node currentPar = node.getParentNode();
final List bufferNodes = new ArrayList<>();
final List removeNodes = new ArrayList<>();
Node saveNode = node;
final Node parentNode = node.getParentNode().getParentNode();
node.getLastChild().setTextContent(lines[0]);
while (saveNode.getNextSibling() != null) { // sauvegarde des noeuds successifs.
saveNode = saveNode.getNextSibling();
removeNodes.add(saveNode);
bufferNodes.add(saveNode.cloneNode(true));
}
removeNodes(removeNodes);
for (int j = 1; j < lines.length; j++) { // pour chaque élément de la ligne splittée.
line = lines[j];
// copie du w:p englobant sans ses fils.
paragraphClone = node.getParentNode().cloneNode(false);
rowClone = node.cloneNode(true);
rowClone.getLastChild().setTextContent(line);
if (DOCXUtil.STYLE_NODE.equals(node.getParentNode().getFirstChild().getNodeName())) { // présence
// d'un
// noeud
// de
// style
styleNodeClone = node.getParentNode().getFirstChild().cloneNode(true);
paragraphClone.appendChild(styleNodeClone);
}
paragraphClone.appendChild(rowClone);
if (currentPar.getNextSibling() == null) { // on a un dernier noeud.
parentNode.appendChild(paragraphClone);
} else { // on n'a pas de dernier noeud.
parentNode.insertBefore(paragraphClone, currentPar.getNextSibling());
}
currentPar = currentPar.getNextSibling();
}
// ajout des noeuds sauvegardés au dernier élément.
if (lines.length > 1) {
for (final Node bufferNode : bufferNodes) {
currentPar.appendChild(bufferNode);
}
}
}
private static void removeNodes(final List removeNodes) {
for (final Node removeNode : removeNodes) {
removeNode.getParentNode().removeChild(removeNode);
}
}
/**
* Enlève les tags d'expression restés vides après traitement.
* @param xmlDoc Document source
* @param xpath Moteur Xpath
* @throws XPathExpressionException Erreur Xpath
*/
private static void removeEmptyTags(final Document xmlDoc, final XPath xpath) throws XPathExpressionException {
final NodeList nodeList = (NodeList) xpath.evaluate("//w:r[w:instrText]", xmlDoc, XPathConstants.NODESET);
Node node;
final Pattern p = Pattern.compile(EMPTY_STRING);
Matcher m;
for (int i = 0; i < nodeList.getLength(); i++) {
node = nodeList.item(i);
m = p.matcher(node.getLastChild().getTextContent());
if (m.matches()) {
node.getParentNode().removeChild(node);
}
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy