org.docx4j.anon.Anonymize Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of docx4j Show documentation
Show all versions of docx4j Show documentation
docx4j is a library which helps you to work with the Office Open
XML file format as used in docx
documents, pptx presentations, and xlsx spreadsheets.
package org.docx4j.anon;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.commons.codec.binary.Base64;
import org.docx4j.TraversalUtil;
import org.docx4j.XmlUtils;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.exceptions.InvalidFormatException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.JaxbXmlPart;
import org.docx4j.openpackaging.parts.Part;
import org.docx4j.openpackaging.parts.PartName;
import org.docx4j.openpackaging.parts.WordprocessingML.BinaryPart;
import org.docx4j.openpackaging.parts.WordprocessingML.FooterPart;
import org.docx4j.openpackaging.parts.WordprocessingML.HeaderPart;
import org.docx4j.openpackaging.parts.WordprocessingML.ImageBmpPart;
import org.docx4j.openpackaging.parts.WordprocessingML.ImageGifPart;
import org.docx4j.openpackaging.parts.WordprocessingML.ImageJpegPart;
import org.docx4j.openpackaging.parts.WordprocessingML.ImagePngPart;
import org.docx4j.openpackaging.parts.WordprocessingML.ImageTiffPart;
import org.docx4j.relationships.Relationship;
import org.docx4j.wml.CTLanguage;
import org.docx4j.wml.RPr;
import org.docx4j.wml.Styles;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class Anonymize {
private static Logger log = LoggerFactory.getLogger(Anonymize.class);
public Anonymize(WordprocessingMLPackage wordMLPackage) {
this.pkg = wordMLPackage;
result = new AnonymizeResult();
}
private WordprocessingMLPackage pkg;
ScrambleText latinizer = null;
DmlVmlAnalyzer dmlVmlAnalyzer = null;
AnonymizeResult result;
// We'll replace images with 2x2 pixels
private static byte[] PNG_IMAGE_DATA;
private static byte[] GIF_IMAGE_DATA;
private static byte[] JPEG_IMAGE_DATA;
private static byte[] BMP_IMAGE_DATA;
private static byte[] TIF_IMAGE_DATA;
static {
PNG_IMAGE_DATA = Base64.decodeBase64("iVBORw0KGgoAAAANSUhEUgAAAAIAAAACAgMAAAAP2OW3AAAADFBMVEUDAP//AAAA/wb//AAD4Tw1AAAACXBIWXMAAAsTAAALEwEAmpwYAAAADElEQVQI12NwYNgAAAF0APHJnpmVAAAAAElFTkSuQmCC");
GIF_IMAGE_DATA = Base64.decodeBase64("R0lGODdhAgACAKEEAAMA//8AAAD/Bv/8ACwAAAAAAgACAAACAww0BQA7");
JPEG_IMAGE_DATA = Base64.decodeBase64(
"/9j/4AAQSkZJRgABAQEASABIAAD/4QCMRXhpZgAATU0AKgAAAAgABwEaAAUAAAABAAAAYgEbAAUA"
+"AAABAAAAagEoAAMAAAABAAIAAAExAAIAAAASAAAAclEQAAEAAAABAQAAAFERAAQAAAABAAALE1ES"
+"AAQAAAABAAALEwAAAAAAARlIAAAD6AABGUgAAAPoUGFpbnQuTkVUIHYzLjUuMTAA/9sAQwACAQEC"
+"AQECAgICAgICAgMFAwMDAwMGBAQDBQcGBwcHBgcHCAkLCQgICggHBwoNCgoLDAwMDAcJDg8NDA4L"
+"DAwM/9sAQwECAgIDAwMGAwMGDAgHCAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwM"
+"DAwMDAwMDAwMDAwMDAwM/8AAEQgAAgACAwEiAAIRAQMRAf/EAB8AAAEFAQEBAQEBAAAAAAAAAAAB"
+"AgMEBQYHCAkKC//EALUQAAIBAwMCBAMFBQQEAAABfQECAwAEEQUSITFBBhNRYQcicRQygZGhCCNC"
+"scEVUtHwJDNicoIJChYXGBkaJSYnKCkqNDU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0"
+"dXZ3eHl6g4SFhoeIiYqSk5SVlpeYmZqio6Slpqeoqaqys7S1tre4ubrCw8TFxsfIycrS09TV1tfY"
+"2drh4uPk5ebn6Onq8fLz9PX29/j5+v/EAB8BAAMBAQEBAQEBAQEAAAAAAAABAgMEBQYHCAkKC//E"
+"ALURAAIBAgQEAwQHBQQEAAECdwABAgMRBAUhMQYSQVEHYXETIjKBCBRCkaGxwQkjM1LwFWJy0QoW"
+"JDThJfEXGBkaJicoKSo1Njc4OTpDREVGR0hJSlNUVVZXWFlaY2RlZmdoaWpzdHV2d3h5eoKDhIWG"
+"h4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uLj5OXm5+jp"
+"6vLz9PX29/j5+v/aAAwDAQACEQMRAD8A/QL4W/sD/ArXfhl4dvr74K/CW8vbzS7ae4uJ/CGnySzy"
+"NErM7sYiWYkkknkk0UUV5+U/7jR/wR/JHRP4mf/Z");
BMP_IMAGE_DATA = Base64.decodeBase64("Qk1GAAAAAAAAADYAAAAoAAAAAgAAAAIAAAABABgAAAAAAAAAAAATCwAAEwsAAAAAAAAAAAAABv8AAPz///8AAP//AAMD/w==");
TIF_IMAGE_DATA = Base64.decodeBase64("");
}
/*
*
* TODO: Info leakage via external parts eg hyperlinks
*
*/
public AnonymizeResult go() throws Docx4JException {
filterMDPRels();
handleMetadata();
result.unsafeParts = PartsAnalyzer.identifyUnsafeParts(pkg.getParts().getParts().entrySet());
detectDmlVmlContent(); // doing this before scramble lets us analyze field types
/* content stories:
*
* - MDP
* - Header/Footer
* - Footnotes/Endnotes
* - Comments
*
* replace with latin text */
applyScrambleCallbackToParts();
// Next, images
handleImages();
return result;
}
/**
* Remove customXml, glossaryDocument, and stylesWithEffects
*/
private void filterMDPRels() {
if (pkg.getMainDocumentPart().getRelationshipsPart()==null) return;
if (log.isDebugEnabled()) {
for (Relationship r : pkg.getMainDocumentPart().getRelationshipsPart().getRelationships().getRelationship()) {
System.out.println(r.getType());
}
}
pkg.getMainDocumentPart().getRelationshipsPart().removeRelationshipsByType(
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/customXml");
pkg.getMainDocumentPart().getRelationshipsPart().removeRelationshipsByType(
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/glossaryDocument");
pkg.getMainDocumentPart().getRelationshipsPart().removeRelationshipsByType(
"http://schemas.microsoft.com/office/2007/relationships/stylesWithEffects");
}
/**
* @throws Docx4JException
*/
protected void handleMetadata() throws Docx4JException {
// docProps/app.xml (Extended Properties)
if (pkg.getDocPropsExtendedPart()!=null
&& pkg.getDocPropsExtendedPart().getContents()!=null) {
pkg.getDocPropsExtendedPart().getContents().setCompany(null);
pkg.getDocPropsExtendedPart().getContents().setManager(null);
pkg.getDocPropsExtendedPart().getContents().setHeadingPairs(null);
}
if (pkg.getDocPropsCorePart()!=null
&& pkg.getDocPropsCorePart().getContents()!=null) {
pkg.getDocPropsCorePart().getContents().setCategory(null);
pkg.getDocPropsCorePart().getContents().setCreator(null);
pkg.getDocPropsCorePart().getContents().setDescription(null);
pkg.getDocPropsCorePart().getContents().setIdentifier(null);
pkg.getDocPropsCorePart().getContents().setKeywords(null);
pkg.getDocPropsCorePart().getContents().setSubject(null);
pkg.getDocPropsCorePart().getContents().setTitle(null);
}
if (pkg.getDocPropsCustomPart()!=null) {
// Remove this
pkg.getRelationshipsPart().removePart(pkg.getDocPropsCustomPart().getPartName());
}
// /docProps/thumbnail.emf, always delete this!
pkg.getRelationshipsPart().removePart(new PartName("/docProps/thumbnail.emf"));
}
/**
* This method replaces images with 2x2 pixels (which Word scales appropriately)
*
* @throws InvalidFormatException
*/
private void handleImages()
throws InvalidFormatException {
// Apply map to headers/footers
for (Entry entry : pkg.getParts().getParts().entrySet()) {
Part p = entry.getValue();
if (p instanceof ImagePngPart
|| p instanceof ImageGifPart
|| p instanceof ImageJpegPart
|| p instanceof ImageBmpPart
|| p instanceof ImageTiffPart
// Others treated as unsafe
) {
((BinaryPart)p).setBinaryData(PNG_IMAGE_DATA);
}
}
}
private void detectDmlVmlContent()
throws InvalidFormatException {
dmlVmlAnalyzer = new DmlVmlAnalyzer();
// Apply map to MDP
detectDmlVml( pkg.getMainDocumentPart() );
// Apply map to headers/footers
for (Entry entry : pkg.getParts().getParts().entrySet()) {
Part p = entry.getValue();
if (p instanceof HeaderPart) {
detectDmlVml( (HeaderPart)p );
}
if (p instanceof FooterPart) {
detectDmlVml( (FooterPart)p );
}
}
// endnotes/footnotes
if (pkg.getMainDocumentPart().getFootnotesPart()!=null) {
detectDmlVml( pkg.getMainDocumentPart().getFootnotesPart() );
}
if (pkg.getMainDocumentPart().getEndNotesPart()!=null) {
detectDmlVml( pkg.getMainDocumentPart().getEndNotesPart() );
}
// Comments
if (pkg.getMainDocumentPart().getCommentsPart()!=null) {
detectDmlVml( pkg.getMainDocumentPart().getCommentsPart() );
}
return;
}
public void detectDmlVml(JaxbXmlPart p) {
log.info("\n\n Inspecting " + p.getPartName().getName());
dmlVmlAnalyzer.reinit();
dmlVmlAnalyzer.setPart(p);
new TraversalUtil(p.getJaxbElement(), dmlVmlAnalyzer);
result.unsafeObjectsByPart.put(p, dmlVmlAnalyzer.unsafeObjects);
if (dmlVmlAnalyzer.unsafeObjects.size()>0){
result.anyUnsafeObjects = true;
}
result.inventoryObjectsByPart.put(p, dmlVmlAnalyzer.inventoryObjects);
if (!result.containsVML) {
result.containsVML = dmlVmlAnalyzer.containsVML;
}
result.fieldsPresent = this.dmlVmlAnalyzer.fieldsPresent;
}
private void applyScrambleCallbackToParts()
throws InvalidFormatException {
try {
latinizer = new ScrambleText(pkg);
// Apply map to MDP
scramble( pkg.getMainDocumentPart() );
// Apply map to headers/footers
for (Entry entry : pkg.getParts().getParts().entrySet()) {
Part p = entry.getValue();
if (p instanceof HeaderPart) {
scramble( (HeaderPart)p );
}
if (p instanceof FooterPart) {
scramble( (FooterPart)p );
}
}
// endnotes/footnotes
if (pkg.getMainDocumentPart().getFootnotesPart()!=null) {
scramble( pkg.getMainDocumentPart().getFootnotesPart() );
}
if (pkg.getMainDocumentPart().getEndNotesPart()!=null) {
scramble( pkg.getMainDocumentPart().getEndNotesPart() );
}
// Comments
if (pkg.getMainDocumentPart().getCommentsPart()!=null) {
scramble( pkg.getMainDocumentPart().getCommentsPart() );
}
// collected at the package level
// result.mostPopularLang = latinizer.langFromLangStats();
result.hasGreek = latinizer.hasGreek;
result.hasCyrillic = latinizer.hasCyrillic;
result.hasHebrew = latinizer.hasHebrew;
result.hasArabic = latinizer.hasArabic;
result.hasHiragana = latinizer.hasHiragana;
result.hasKatakana = latinizer.hasKatakana;
result.hasCJK = latinizer.hasCJK;
return;
} catch (Exception e) {
e.printStackTrace();
throw new InvalidFormatException(e.getMessage(), e);
}
}
public void scramble(JaxbXmlPart p) {
log.info("\n\n Scrambling " + p.getPartName().getName());
new TraversalUtil(p.getJaxbElement(), latinizer);
}
}