
// org.docx4j.anon.Anonymize Maven / Gradle / Ivy
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of docx4j-docx-anon Show documentation
// Show all versions of docx4j-docx-anon Show documentation
// anonymization of docx files
package org.docx4j.anon;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.commons.codec.binary.Base64;
import org.docx4j.TraversalUtil;
import org.docx4j.XmlUtils;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.exceptions.InvalidFormatException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.JaxbXmlPart;
import org.docx4j.openpackaging.parts.Part;
import org.docx4j.openpackaging.parts.PartName;
import org.docx4j.openpackaging.parts.WordprocessingML.BinaryPart;
import org.docx4j.openpackaging.parts.WordprocessingML.FooterPart;
import org.docx4j.openpackaging.parts.WordprocessingML.HeaderPart;
import org.docx4j.openpackaging.parts.WordprocessingML.ImageBmpPart;
import org.docx4j.openpackaging.parts.WordprocessingML.ImageGifPart;
import org.docx4j.openpackaging.parts.WordprocessingML.ImageJpegPart;
import org.docx4j.openpackaging.parts.WordprocessingML.ImagePngPart;
import org.docx4j.openpackaging.parts.WordprocessingML.ImageTiffPart;
import org.docx4j.relationships.Relationship;
import org.docx4j.wml.CTLanguage;
import org.docx4j.wml.RPr;
import org.docx4j.wml.Styles;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Anonymizes a {@link WordprocessingMLPackage} in place.
 *
 * <p>{@link #go()} runs the following steps, in order:
 * <ol>
 *   <li>removes the customXml, glossaryDocument and stylesWithEffects relationships
 *       from the main document part;</li>
 *   <li>clears selected extended/core properties and removes the custom properties
 *       part and the thumbnail;</li>
 *   <li>records the parts reported by {@code PartsAnalyzer.identifyUnsafeParts};</li>
 *   <li>traverses each content story with a {@link DmlVmlAnalyzer};</li>
 *   <li>traverses each content story with a {@link ScrambleText} callback
 *       (per the original comments, this replaces content with latin text);</li>
 *   <li>overwrites the binary data of supported image parts with a small placeholder.</li>
 * </ol>
 *
 * <p>The "content stories" are the main document part, headers, footers, footnotes,
 * endnotes and comments.
 *
 * <p>TODO: Info leakage via external parts eg hyperlinks
 */
public class Anonymize {

    private static final Logger log = LoggerFactory.getLogger(Anonymize.class);

    // We'll replace images with 2x2 pixels
    private static final byte[] PNG_IMAGE_DATA;
    private static final byte[] GIF_IMAGE_DATA;
    private static final byte[] JPEG_IMAGE_DATA;
    private static final byte[] BMP_IMAGE_DATA;
    // Decoded from an empty string below, i.e. a zero-length array.
    private static final byte[] TIF_IMAGE_DATA;

    static {
        PNG_IMAGE_DATA = Base64.decodeBase64("iVBORw0KGgoAAAANSUhEUgAAAAIAAAACAgMAAAAP2OW3AAAADFBMVEUDAP//AAAA/wb//AAD4Tw1AAAACXBIWXMAAAsTAAALEwEAmpwYAAAADElEQVQI12NwYNgAAAF0APHJnpmVAAAAAElFTkSuQmCC");
        GIF_IMAGE_DATA = Base64.decodeBase64("R0lGODdhAgACAKEEAAMA//8AAAD/Bv/8ACwAAAAAAgACAAACAww0BQA7");
        JPEG_IMAGE_DATA = Base64.decodeBase64(
                "/9j/4AAQSkZJRgABAQEASABIAAD/4QCMRXhpZgAATU0AKgAAAAgABwEaAAUAAAABAAAAYgEbAAUA"
                +"AAABAAAAagEoAAMAAAABAAIAAAExAAIAAAASAAAAclEQAAEAAAABAQAAAFERAAQAAAABAAALE1ES"
                +"AAQAAAABAAALEwAAAAAAARlIAAAD6AABGUgAAAPoUGFpbnQuTkVUIHYzLjUuMTAA/9sAQwACAQEC"
                +"AQECAgICAgICAgMFAwMDAwMGBAQDBQcGBwcHBgcHCAkLCQgICggHBwoNCgoLDAwMDAcJDg8NDA4L"
                +"DAwM/9sAQwECAgIDAwMGAwMGDAgHCAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwM"
                +"DAwMDAwMDAwMDAwMDAwM/8AAEQgAAgACAwEiAAIRAQMRAf/EAB8AAAEFAQEBAQEBAAAAAAAAAAAB"
                +"AgMEBQYHCAkKC//EALUQAAIBAwMCBAMFBQQEAAABfQECAwAEEQUSITFBBhNRYQcicRQygZGhCCNC"
                +"scEVUtHwJDNicoIJChYXGBkaJSYnKCkqNDU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0"
                +"dXZ3eHl6g4SFhoeIiYqSk5SVlpeYmZqio6Slpqeoqaqys7S1tre4ubrCw8TFxsfIycrS09TV1tfY"
                +"2drh4uPk5ebn6Onq8fLz9PX29/j5+v/EAB8BAAMBAQEBAQEBAQEAAAAAAAABAgMEBQYHCAkKC//E"
                +"ALURAAIBAgQEAwQHBQQEAAECdwABAgMRBAUhMQYSQVEHYXETIjKBCBRCkaGxwQkjM1LwFWJy0QoW"
                +"JDThJfEXGBkaJicoKSo1Njc4OTpDREVGR0hJSlNUVVZXWFlaY2RlZmdoaWpzdHV2d3h5eoKDhIWG"
                +"h4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uLj5OXm5+jp"
                +"6vLz9PX29/j5+v/aAAwDAQACEQMRAD8A/QL4W/sD/ArXfhl4dvr74K/CW8vbzS7ae4uJ/CGnySzy"
                +"NErM7sYiWYkkknkk0UUV5+U/7jR/wR/JHRP4mf/Z");
        BMP_IMAGE_DATA = Base64.decodeBase64("Qk1GAAAAAAAAADYAAAAoAAAAAgAAAAIAAAABABgAAAAAAAAAAAATCwAAEwsAAAAAAAAAAAAABv8AAPz///8AAP//AAMD/w==");
        TIF_IMAGE_DATA = Base64.decodeBase64("");
    }

    /** The package being anonymized; it is modified in place. */
    private final WordprocessingMLPackage pkg;

    /** Traversal callback used by {@link #scramble(JaxbXmlPart)}; created when scrambling starts. */
    ScrambleText latinizer = null;

    /** Traversal callback used by {@link #detectDmlVml(JaxbXmlPart)}; created when detection starts. */
    DmlVmlAnalyzer dmlVmlAnalyzer = null;

    /** Accumulates the findings of the run; returned by {@link #go()}. */
    AnonymizeResult result;

    /**
     * Creates an anonymizer for the given package.
     *
     * @param wordMLPackage the package to anonymize (modified in place by {@link #go()})
     */
    public Anonymize(WordprocessingMLPackage wordMLPackage) {
        this.pkg = wordMLPackage;
        result = new AnonymizeResult();
    }

    /**
     * Runs the anonymization steps against the package and reports the findings.
     *
     * @return the result object populated during the run
     * @throws Docx4JException propagated from any of the individual steps
     */
    public AnonymizeResult go() throws Docx4JException {

        filterMDPRels();

        handleMetadata();

        result.unsafeParts = PartsAnalyzer.identifyUnsafeParts(pkg.getParts().getParts().entrySet());

        // doing this before scramble lets us analyze field types
        detectDmlVmlContent();

        /* content stories:
         *
         * - MDP
         * - Header/Footer
         * - Footnotes/Endnotes
         * - Comments
         *
         * replace with latin text */
        applyScrambleCallbackToParts();

        // Next, images
        handleImages();

        return result;
    }

    /**
     * Remove customXml, glossaryDocument, and stylesWithEffects
     * relationships from the main document part. Does nothing if the main
     * document part has no relationships part.
     */
    private void filterMDPRels() {

        if (pkg.getMainDocumentPart().getRelationshipsPart() == null) {
            return;
        }

        if (log.isDebugEnabled()) {
            // Route through the logger rather than stdout, so the output honours the logging config.
            for (Relationship r : pkg.getMainDocumentPart().getRelationshipsPart().getRelationships().getRelationship()) {
                log.debug(r.getType());
            }
        }

        pkg.getMainDocumentPart().getRelationshipsPart().removeRelationshipsByType(
                "http://schemas.openxmlformats.org/officeDocument/2006/relationships/customXml");

        pkg.getMainDocumentPart().getRelationshipsPart().removeRelationshipsByType(
                "http://schemas.openxmlformats.org/officeDocument/2006/relationships/glossaryDocument");

        pkg.getMainDocumentPart().getRelationshipsPart().removeRelationshipsByType(
                "http://schemas.microsoft.com/office/2007/relationships/stylesWithEffects");
    }

    /**
     * Clears identifying metadata: selected extended and core properties are set to
     * {@code null}, the custom properties part (if any) is removed, and the part named
     * {@code /docProps/thumbnail.emf} is removed from the package relationships.
     *
     * @throws Docx4JException propagated from the docx4j calls made here
     */
    protected void handleMetadata() throws Docx4JException {

        // docProps/app.xml (Extended Properties)
        if (pkg.getDocPropsExtendedPart() != null
                && pkg.getDocPropsExtendedPart().getContents() != null) {
            pkg.getDocPropsExtendedPart().getContents().setCompany(null);
            pkg.getDocPropsExtendedPart().getContents().setManager(null);
            pkg.getDocPropsExtendedPart().getContents().setHeadingPairs(null);
        }

        // Core properties
        if (pkg.getDocPropsCorePart() != null
                && pkg.getDocPropsCorePart().getContents() != null) {
            pkg.getDocPropsCorePart().getContents().setCategory(null);
            pkg.getDocPropsCorePart().getContents().setCreator(null);
            pkg.getDocPropsCorePart().getContents().setDescription(null);
            pkg.getDocPropsCorePart().getContents().setIdentifier(null);
            pkg.getDocPropsCorePart().getContents().setKeywords(null);
            pkg.getDocPropsCorePart().getContents().setSubject(null);
            pkg.getDocPropsCorePart().getContents().setTitle(null);
        }

        if (pkg.getDocPropsCustomPart() != null) {
            // Remove this
            pkg.getRelationshipsPart().removePart(pkg.getDocPropsCustomPart().getPartName());
        }

        // /docProps/thumbnail.emf, always delete this!
        pkg.getRelationshipsPart().removePart(new PartName("/docProps/thumbnail.emf"));
    }

    /**
     * This method replaces images with 2x2 pixels (which Word scales appropriately).
     *
     * <p>PNG, GIF, JPEG, BMP and TIFF image parts are overwritten; other image types
     * are left alone here (they are treated as unsafe).
     *
     * <p>NOTE(review): every supported type currently receives the PNG placeholder;
     * the GIF/JPEG/BMP constants are unused and the TIFF constant is empty. Confirm
     * whether per-format placeholders are wanted before changing this.
     *
     * @throws InvalidFormatException declared for callers; not thrown by the visible code
     */
    private void handleImages()
            throws InvalidFormatException {

        for (Entry<PartName, Part> entry : pkg.getParts().getParts().entrySet()) {

            Part p = entry.getValue();

            if (p instanceof ImagePngPart
                    || p instanceof ImageGifPart
                    || p instanceof ImageJpegPart
                    || p instanceof ImageBmpPart
                    || p instanceof ImageTiffPart
                    // Others treated as unsafe
                    ) {
                ((BinaryPart) p).setBinaryData(PNG_IMAGE_DATA);
            }
        }
    }

    /**
     * Creates a fresh {@link DmlVmlAnalyzer} and runs {@link #detectDmlVml(JaxbXmlPart)}
     * over each content story: main document part, headers/footers, footnotes,
     * endnotes and comments.
     *
     * @throws InvalidFormatException declared for callers; not thrown by the visible code
     */
    private void detectDmlVmlContent()
            throws InvalidFormatException {

        dmlVmlAnalyzer = new DmlVmlAnalyzer();

        // MDP
        detectDmlVml(pkg.getMainDocumentPart());

        // headers/footers
        for (Entry<PartName, Part> entry : pkg.getParts().getParts().entrySet()) {

            Part p = entry.getValue();

            if (p instanceof HeaderPart) {
                detectDmlVml((HeaderPart) p);
            }
            if (p instanceof FooterPart) {
                detectDmlVml((FooterPart) p);
            }
        }

        // endnotes/footnotes
        if (pkg.getMainDocumentPart().getFootnotesPart() != null) {
            detectDmlVml(pkg.getMainDocumentPart().getFootnotesPart());
        }
        if (pkg.getMainDocumentPart().getEndNotesPart() != null) {
            detectDmlVml(pkg.getMainDocumentPart().getEndNotesPart());
        }

        // Comments
        if (pkg.getMainDocumentPart().getCommentsPart() != null) {
            detectDmlVml(pkg.getMainDocumentPart().getCommentsPart());
        }
    }

    /**
     * Traverses one part with the {@link DmlVmlAnalyzer} and copies its findings into
     * the result: unsafe and inventory objects are stored per part, the
     * {@code anyUnsafeObjects} and {@code containsVML} flags are only ever switched on,
     * and {@code fieldsPresent} is taken from the analyzer after each part.
     *
     * <p>Requires {@code dmlVmlAnalyzer} to have been created already.
     *
     * @param p the part to inspect
     */
    public void detectDmlVml(JaxbXmlPart p) {

        log.info("\n\n Inspecting " + p.getPartName().getName());

        dmlVmlAnalyzer.reinit();
        dmlVmlAnalyzer.setPart(p);

        new TraversalUtil(p.getJaxbElement(), dmlVmlAnalyzer);

        result.unsafeObjectsByPart.put(p, dmlVmlAnalyzer.unsafeObjects);
        if (dmlVmlAnalyzer.unsafeObjects.size() > 0) {
            result.anyUnsafeObjects = true;
        }

        result.inventoryObjectsByPart.put(p, dmlVmlAnalyzer.inventoryObjects);

        // Sticky flag: once any part contains VML, keep it set.
        if (!result.containsVML) {
            result.containsVML = dmlVmlAnalyzer.containsVML;
        }

        result.fieldsPresent = this.dmlVmlAnalyzer.fieldsPresent;
    }

    /**
     * Creates a {@link ScrambleText} callback for the package, runs
     * {@link #scramble(JaxbXmlPart)} over each content story, then copies the
     * script-detection flags collected by the callback into the result.
     *
     * @throws InvalidFormatException wrapping any exception raised while scrambling
     */
    private void applyScrambleCallbackToParts()
            throws InvalidFormatException {

        try {
            latinizer = new ScrambleText(pkg);

            // MDP
            scramble(pkg.getMainDocumentPart());

            // headers/footers
            for (Entry<PartName, Part> entry : pkg.getParts().getParts().entrySet()) {

                Part p = entry.getValue();

                if (p instanceof HeaderPart) {
                    scramble((HeaderPart) p);
                }
                if (p instanceof FooterPart) {
                    scramble((FooterPart) p);
                }
            }

            // endnotes/footnotes
            if (pkg.getMainDocumentPart().getFootnotesPart() != null) {
                scramble(pkg.getMainDocumentPart().getFootnotesPart());
            }
            if (pkg.getMainDocumentPart().getEndNotesPart() != null) {
                scramble(pkg.getMainDocumentPart().getEndNotesPart());
            }

            // Comments
            if (pkg.getMainDocumentPart().getCommentsPart() != null) {
                scramble(pkg.getMainDocumentPart().getCommentsPart());
            }

            // collected at the package level
            result.hasGreek = latinizer.hasGreek;
            result.hasCyrillic = latinizer.hasCyrillic;
            result.hasHebrew = latinizer.hasHebrew;
            result.hasArabic = latinizer.hasArabic;
            result.hasHiragana = latinizer.hasHiragana;
            result.hasKatakana = latinizer.hasKatakana;
            result.hasCJK = latinizer.hasCJK;

        } catch (Exception e) {
            // Log through the logger (not stderr); the cause is also kept on the rethrown exception.
            log.error(e.getMessage(), e);
            throw new InvalidFormatException(e.getMessage(), e);
        }
    }

    /**
     * Traverses one part with the {@link ScrambleText} callback.
     *
     * <p>Requires {@code latinizer} to have been created already.
     *
     * @param p the part to scramble
     */
    public void scramble(JaxbXmlPart p) {

        log.info("\n\n Scrambling " + p.getPartName().getName());

        new TraversalUtil(p.getJaxbElement(), latinizer);
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy