All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.itextpdf.pdfcleanup.PdfCleanUpProcessor Maven / Gradle / Ivy

The newest version!
/*
    This file is part of the iText (R) project.
    Copyright (c) 1998-2024 Apryse Group NV
    Authors: Apryse Software.

    This program is offered under a commercial and under the AGPL license.
    For commercial licensing, contact us at https://itextpdf.com/sales.  For AGPL licensing, see below.

    AGPL licensing:
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
package com.itextpdf.pdfcleanup;

import com.itextpdf.io.image.ImageData;
import com.itextpdf.io.image.ImageDataFactory;
import com.itextpdf.io.source.ByteUtils;
import com.itextpdf.kernel.exceptions.PdfException;
import com.itextpdf.kernel.colors.Color;
import com.itextpdf.kernel.font.PdfFont;
import com.itextpdf.kernel.geom.BezierCurve;
import com.itextpdf.kernel.geom.IShape;
import com.itextpdf.kernel.geom.Matrix;
import com.itextpdf.kernel.geom.Path;
import com.itextpdf.kernel.geom.Point;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.geom.Subpath;
import com.itextpdf.kernel.pdf.PdfArray;
import com.itextpdf.kernel.pdf.PdfDictionary;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfLiteral;
import com.itextpdf.kernel.pdf.PdfName;
import com.itextpdf.kernel.pdf.PdfNumber;
import com.itextpdf.kernel.pdf.PdfObject;
import com.itextpdf.kernel.pdf.PdfPage;
import com.itextpdf.kernel.pdf.PdfResources;
import com.itextpdf.kernel.pdf.PdfStream;
import com.itextpdf.kernel.pdf.PdfTextArray;
import com.itextpdf.kernel.pdf.annot.PdfAnnotation;
import com.itextpdf.kernel.pdf.annot.PdfLineAnnotation;
import com.itextpdf.kernel.pdf.annot.PdfLinkAnnotation;
import com.itextpdf.kernel.pdf.annot.PdfMarkupAnnotation;
import com.itextpdf.kernel.pdf.annot.PdfPopupAnnotation;
import com.itextpdf.kernel.pdf.annot.PdfTextMarkupAnnotation;
import com.itextpdf.kernel.pdf.canvas.CanvasGraphicsState;
import com.itextpdf.kernel.pdf.canvas.CanvasTag;
import com.itextpdf.kernel.pdf.canvas.PdfCanvas;
import com.itextpdf.kernel.pdf.canvas.PdfCanvasConstants;
import com.itextpdf.kernel.pdf.canvas.PdfCanvasConstants.FillingRule;
import com.itextpdf.kernel.pdf.canvas.parser.EventType;
import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor;
import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData;
import com.itextpdf.kernel.pdf.canvas.parser.data.ImageRenderInfo;
import com.itextpdf.kernel.pdf.canvas.parser.data.PathRenderInfo;
import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo;
import com.itextpdf.kernel.pdf.canvas.parser.listener.IEventListener;
import com.itextpdf.kernel.pdf.colorspace.shading.AbstractPdfShading;
import com.itextpdf.kernel.pdf.tagutils.TagTreePointer;
import com.itextpdf.kernel.pdf.xobject.PdfFormXObject;
import com.itextpdf.kernel.pdf.xobject.PdfImageXObject;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Deque;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;

import com.itextpdf.pdfcleanup.logs.CleanUpLogMessageConstant;
import com.itextpdf.pdfcleanup.util.CleanUpCsCompareUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * PDF content stream processor, which filters content to be cleaned up.
 */
public class PdfCleanUpProcessor extends PdfCanvasProcessor {

    // Content stream operators grouped by how this processor treats them.
    // All sets are immutable and hold the operator exactly as it is spelled in a content stream.

    // Operators whose text is passed through the clean-up filter.
    private static final Set<String> TEXT_SHOWING_OPERATORS = Collections.unmodifiableSet(new HashSet<>(
            Arrays.asList("TJ", "Tj", "'", "\"")));
    private static final Set<String> PATH_CONSTRUCTION_OPERATORS = Collections.unmodifiableSet(new HashSet<>(
            Arrays.asList("m", "l", "c", "v", "y", "h", "re")));
    private static final Set<String> STROKE_OPERATORS = Collections.unmodifiableSet(new HashSet<>(
            Arrays.asList("S", "s", "B", "B*", "b", "b*")));
    // Fill operators using the nonzero winding number rule.
    private static final Set<String> NW_FILL_OPERATORS = Collections.unmodifiableSet(new HashSet<>(
            Arrays.asList("f", "F", "B", "b")));
    // Fill operators using the even-odd rule.
    private static final Set<String> EO_FILL_OPERATORS = Collections.unmodifiableSet(new HashSet<>(
            Arrays.asList("f*", "B*", "b*")));
    // Union of the stroke and fill operators plus "n"; populated in the static initializer.
    private static final Set<String> PATH_PAINTING_OPERATORS;
    private static final Set<String> CLIPPING_PATH_OPERATORS = Collections.unmodifiableSet(new HashSet<>(
            Arrays.asList("W", "W*")));
    private static final Set<String> LINE_STYLE_OPERATORS = Collections.unmodifiableSet(new HashSet<>(
            Arrays.asList("w", "J", "j", "M", "d")));
    private static final Set<String> STROKE_COLOR_OPERATORS = Collections.unmodifiableSet(new HashSet<>(
            Arrays.asList("CS", "SC", "SCN", "G", "RG", "K")));
    private static final Set<String> FILL_COLOR_OPERATORS = Collections.unmodifiableSet(new HashSet<>(
            Arrays.asList("cs", "sc", "scn", "g", "rg", "k")));

    // TL actually is not a text positioning operator, but we need to process it with them
    private static final Set<String> TEXT_POSITIONING_OPERATORS = Collections.unmodifiableSet(new HashSet<>(
            Arrays.asList("Td", "TD", "Tm", "T*", "TL")));

    // these operators are processed via PdfCanvasProcessor graphics state and event listener
    private static final Set<String> IGNORED_OPERATORS;

    static {
        // HashSet is required in order to autoport correctly in .Net
        HashSet tempSet = new HashSet<>();
        tempSet.addAll(STROKE_OPERATORS);
        tempSet.addAll(NW_FILL_OPERATORS);
        tempSet.addAll(EO_FILL_OPERATORS);
        tempSet.add("n");
        PATH_PAINTING_OPERATORS = Collections.unmodifiableSet(tempSet);

        tempSet = new HashSet<>();
        tempSet.addAll(PATH_CONSTRUCTION_OPERATORS);
        tempSet.addAll(CLIPPING_PATH_OPERATORS);
        tempSet.addAll(LINE_STYLE_OPERATORS);
        tempSet.addAll(Arrays.asList("Tc", "Tw", "Tz", "Tf", "Tr", "Ts"));
        tempSet.addAll(Arrays.asList("BMC", "BDC"));
        IGNORED_OPERATORS = Collections.unmodifiableSet(tempSet);
    }

    // Document that owns the canvases and filtered images created by this processor.
    private PdfDocument document;
    // Page whose content is currently being processed; set in processPageContent.
    private PdfPage currentPage;
    // Filter built from the clean-up regions and properties passed to the constructor.
    private PdfCleanUpFilter filter;
    // Output canvases: processContent pushes a new one, popCleanedCanvas removes the top one.
    private Stack<PdfCanvas> canvasStack;

    // When true, an annotation that only partially overlaps a redaction region is removed as well.
    private boolean removeAnnotIfPartOverlap = true;

    /**
     * In {@code notAppliedGsParams} field not written graphics state params are stored.
     * Stack represents gs params on different levels of the q/Q nesting (see {@link NotAppliedGsParams}).
     * On "q" operator new {@code NotAppliedGsParams} is pushed to the stack and on "Q" it is popped.
     * <p>
     * When operators are applied, they are written from the outer to inner nesting level, separated by "q".
     * After being written the stack is cleared.
     * <p>
     * Graphics state parameters are applied in two ways:
     * <ul>
     * <li>
     * first - right before writing text content, text state in current gs is compared to the text state of the text
     * render info gs and difference is applied to current gs;
     * <li>
     * second - through list of the not applied gs params. Right before writing some content, this list is checked,
     * and if something affecting content is stored in this list it will be applied.
     * </ul>
*/ private Deque notAppliedGsParams; private Deque notWrittenTags; private int numOfOpenedTagsInsideText; private boolean btEncountered; private boolean isInText; private TextPositioning textPositioning; private FilteredImagesCache filteredImagesCache; PdfCleanUpProcessor(List cleanUpRegions, PdfDocument document) { this(cleanUpRegions, document, new CleanUpProperties()); } PdfCleanUpProcessor(List cleanUpRegions, PdfDocument document, CleanUpProperties properties) { super(new PdfCleanUpEventListener()); this.document = document; this.filter = new PdfCleanUpFilter(cleanUpRegions, properties); this.canvasStack = new Stack<>(); this.notAppliedGsParams = new ArrayDeque<>(); this.notAppliedGsParams.push(new NotAppliedGsParams()); this.notWrittenTags = new ArrayDeque<>(); this.numOfOpenedTagsInsideText = 0; this.btEncountered = false; this.isInText = false; this.textPositioning = new TextPositioning(); } @Override public void processPageContent(PdfPage page) { currentPage = page; super.processPageContent(page); } /** * Process the annotations of a page. 
* Default process behaviour is to remove the annotation if there is (partial) overlap with a redaction region * * @param page the page to process * @param regions a list of redaction regions * @param redactRedactAnnotations true if annotation with subtype /Redact should also be removed */ public void processPageAnnotations(PdfPage page, List regions, boolean redactRedactAnnotations) { // Iterate over annotations for (PdfAnnotation annot : page.getAnnotations()) { PdfName annotSubtype = annot.getSubtype(); if (PdfName.Popup.equals(annotSubtype)) { // we handle popup annots together with PdfMarkupAnnotation annots only continue; } if (!redactRedactAnnotations && PdfName.Redact.equals(annotSubtype)) { continue; } // Check against regions for (Rectangle region : regions) { if (annotationIsToBeRedacted(annot, region)) { if (annot instanceof PdfMarkupAnnotation) { PdfPopupAnnotation popup = ((PdfMarkupAnnotation) annot).getPopup(); if (popup != null) { page.removeAnnotation(popup); } } page.removeAnnotation(annot); break; } } } } void setFilteredImagesCache(FilteredImagesCache cache) { this.filteredImagesCache = cache; } /** * @param contentBytes the bytes of a content stream * @param resources the resources of the content stream. Must not be null. */ @Override public void processContent(byte[] contentBytes, PdfResources resources) { canvasStack.push(new PdfCanvas(new PdfStream(), new PdfResources(), document)); if (canvasStack.size() == 1) { // If it is the first canvas, we begin to wrap it with q getCanvas().saveState(); } super.processContent(contentBytes, resources); // Here we don't pop() canvases by intent. 
It is the responsibility of the one who utilizes the canvas data } @Override public IEventListener getEventListener() { return eventListener; } PdfCanvas popCleanedCanvas() { // If it is the last canvas, we finish to wrap it with Q if (canvasStack.size() == 1) { getCanvas().restoreState(); } return canvasStack.pop(); } @Override protected void invokeOperator(PdfLiteral operator, List operands) { String operatorString = operator.toString(); writeGsParamsIfFormXObject(operatorString, operands); super.invokeOperator(operator, operands); popCanvasIfFormXObject(operatorString, operands); filterContent(operatorString, operands); } @Override protected void beginMarkedContent(PdfName tag, PdfDictionary dict) { super.beginMarkedContent(tag, dict); notWrittenTags.push(new CanvasTag(tag).setProperties(dict)); if (btEncountered) { ++numOfOpenedTagsInsideText; } } static void writeOperands(PdfCanvas canvas, List operands) { int index = 0; for (PdfObject obj : operands) { canvas.getContentStream().getOutputStream().write(obj); if (operands.size() > ++index) { canvas.getContentStream().getOutputStream().writeSpace(); } else { canvas.getContentStream().getOutputStream().writeNewLine(); } } } static Matrix operandsToMatrix(List operands) { float a = ((PdfNumber) operands.get(0)).floatValue(); float b = ((PdfNumber) operands.get(1)).floatValue(); float c = ((PdfNumber) operands.get(2)).floatValue(); float d = ((PdfNumber) operands.get(3)).floatValue(); float e = ((PdfNumber) operands.get(4)).floatValue(); float f = ((PdfNumber) operands.get(5)).floatValue(); return new Matrix(a, b, c, d, e, f); } @Override protected void eventOccurred(IEventData data, EventType type) { if (supportedEvents == null || supportedEvents.contains(type)) { eventListener.eventOccurred(data, type); } } /** * Returns the last canvas without removing it. * * @return the last canvas in canvasStack. */ PdfCanvas getCanvas() { return canvasStack.peek(); } /** * Adds tag to the deque of not written tags. 
* * @param tag tag to be added. */ void addNotWrittenTag(CanvasTag tag) { notWrittenTags.push(tag); } /** * Opens all tags from deque of not written tags. Should be called before some content is drawn. */ void openNotWrittenTags() { CanvasTag tag = notWrittenTags.pollLast(); while (tag != null) { getCanvas().openTag(tag); tag = notWrittenTags.pollLast(); } } private boolean annotationIsToBeRedacted(PdfAnnotation annotation, Rectangle redactRegion) { // TODO(DEVSIX-1605,DEVSIX-1606,DEVSIX-1607,DEVSIX-1608,DEVSIX-1609) removeAnnotIfPartOverlap = true; PdfName annotationType = annotation.getPdfObject().getAsName(PdfName.Subtype); if (annotationType.equals(PdfName.Watermark)) { // TODO /FixedPrint entry effect is not fully investigated: DEVSIX-2471 Logger logger = LoggerFactory.getLogger(PdfCleanUpProcessor.class); logger.warn(CleanUpLogMessageConstant.REDACTION_OF_ANNOTATION_TYPE_WATERMARK_IS_NOT_SUPPORTED); } PdfArray rectAsArray = annotation.getRectangle(); Rectangle rect = null; if (rectAsArray != null) { rect = rectAsArray.toRectangle(); } boolean annotationIsToBeRedacted = processAnnotationRectangle(redactRegion, rect); // Special processing for some types of annotations. if (PdfName.Link.equals(annotationType)) { PdfArray quadPoints = ((PdfLinkAnnotation) annotation).getQuadPoints(); if (quadPointsForLinkAnnotationAreValid(rect, quadPoints)) { annotationIsToBeRedacted = processAnnotationQuadPoints(redactRegion, quadPoints); } } else if (annotationType.equals(PdfName.Highlight) || annotationType.equals(PdfName.Underline) || annotationType.equals(PdfName.Squiggly) || annotationType.equals(PdfName.StrikeOut)) { PdfArray quadPoints = ((PdfTextMarkupAnnotation) annotation).getQuadPoints(); // The annotation dictionary’s AP entry, if present, shall take precedence over QuadPoints. 
if (quadPoints != null && annotation.getAppearanceDictionary() == null) { try { annotationIsToBeRedacted = processAnnotationQuadPoints(redactRegion, quadPoints); } catch (PdfException ignored) { // if quad points array cannot be processed, simply ignore it } } } else if (annotationType.equals(PdfName.Line)) { PdfArray line = ((PdfLineAnnotation) annotation).getLine(); if (line != null) { Rectangle drawnLineRectangle = line.toRectangle(); // Line annotation might contain line leaders, so let's double check overlapping with /Rect area, for simplicity. // TODO DEVSIX-1607 annotationIsToBeRedacted = annotationIsToBeRedacted || processAnnotationRectangle(redactRegion, drawnLineRectangle); } } return annotationIsToBeRedacted; } private boolean processAnnotationQuadPoints(Rectangle redactRegion, PdfArray quadPoints) { List boundingRectangles = Rectangle.createBoundingRectanglesFromQuadPoint(quadPoints); boolean bboxOverlapped = false; for (Rectangle bbox : boundingRectangles) { bboxOverlapped = bboxOverlapped || processAnnotationRectangle(redactRegion, bbox); } return bboxOverlapped; } private boolean processAnnotationRectangle(Rectangle redactRegion, Rectangle annotationRect) { if (annotationRect == null) { return false; } // 3 possible situations: full overlap, partial overlap, no overlap if (redactRegion.overlaps(annotationRect)) { if (redactRegion.contains(annotationRect)) { // full overlap return true; } Rectangle intersectionRect = redactRegion.getIntersection(annotationRect); if (intersectionRect != null) { // partial overlap if (removeAnnotIfPartOverlap) { return true; } else { //TODO (DEVSIX-1605,DEVSIX-1606,DEVSIX-1609) } } } // No overlap, do nothing return false; } /** * For a link annotation, a quadpoints array can be specified * but it will be ignored in favour of the rectangle * if one of the points is located outside the rectangle's boundaries * * @param rect rectangle entry of the link annotation * @param quadPoints An array of 8 × n numbers specifying 
the coordinates of n quadrilaterals * in default user space that comprise the region in which the link should be activated. * @return true if the quad points are valid, false if the quadpoint array should be used */ private boolean quadPointsForLinkAnnotationAreValid(Rectangle rect, PdfArray quadPoints) { if (quadPoints == null || quadPoints.isEmpty() || quadPoints.size() % 8 != 0) { return false; } for (int i = 0; i < quadPoints.size(); i += 8) { for (int j = 0; j < 8; j += 2) { PdfNumber pointX = quadPoints.getAsNumber(i + j); PdfNumber pointY = quadPoints.getAsNumber(i + j + 1); if (pointX == null || pointY == null) { return false; } float x = pointX.floatValue(); float y = pointY.floatValue(); if (rect != null && !rect.contains(new Rectangle(x, y, 0, 0))) { return false; } } } return true; } private void writeGsParamsIfFormXObject(String operator, List operands) { if ("Do".equals(operator)) { PdfStream formStream = getXObjectStream((PdfName) operands.get(0)); if (PdfName.Form.equals(formStream.getAsName(PdfName.Subtype))) { writeNotAppliedGsParams(true, true); openNotWrittenTags(); } } } private void popCanvasIfFormXObject(String operator, List operands) { if ("Do".equals(operator)) { PdfStream formStream = getXObjectStream((PdfName) operands.get(0)); if (PdfName.Form.equals(formStream.getAsName(PdfName.Subtype))) { PdfCanvas cleanedCanvas = popCleanedCanvas(); PdfFormXObject newFormXObject = new PdfFormXObject((Rectangle) null); newFormXObject.getPdfObject().putAll(formStream); if (formStream.containsKey(PdfName.Resources)) { newFormXObject.put(PdfName.Resources, cleanedCanvas.getResources().getPdfObject()); } newFormXObject.getPdfObject().setData(cleanedCanvas.getContentStream().getBytes()); PdfName name = getCanvas().getResources().addForm(newFormXObject); getCanvas().getContentStream().getOutputStream().write(name).writeSpace().writeBytes(ByteUtils.getIsoBytes("Do\n")); } } } private void filterContent(String operator, List operands) { if 
(TEXT_SHOWING_OPERATORS.contains(operator)) { cleanText(operator, operands); } else if ("Do".equals(operator)) { checkIfImageAndClean(operands); } else if ("EI".equals(operator)) { cleanInlineImage(); } else if (PATH_PAINTING_OPERATORS.contains(operator)) { writePath(); } else if ("q".equals(operator)) { notAppliedGsParams.push(new NotAppliedGsParams()); } else if ("Q".equals(operator)) { notAppliedGsParams.pop(); if (notAppliedGsParams.size() == 0) { getCanvas().restoreState(); notAppliedGsParams.push(new NotAppliedGsParams()); } } else if ("BT".equals(operator)) { btEncountered = true; } else if ("ET".equals(operator)) { if (isInText) { writeOperands(getCanvas(), operands); isInText = false; } btEncountered = false; textPositioning.clear(); } else if (TEXT_POSITIONING_OPERATORS.contains(operator)) { textPositioning.appendPositioningOperator(operator, operands); } else if ("EMC".equals(operator)) { // BMC and BDC are handled with BeginMarkedContent method removeOrCloseTag(); } else if (LINE_STYLE_OPERATORS.contains(operator)) { notAppliedGsParams.peek().lineStyleOperators.put(operator, new ArrayList<>(operands)); } else if ("gs".equals(operator)) { notAppliedGsParams.peek().extGStates.add(getResources().getResource(PdfName.ExtGState).getAsDictionary((PdfName) operands.get(0))); } else if ("cm".equals(operator)) { notAppliedGsParams.peek().ctms.add(new ArrayList<>(operands)); } else if (STROKE_COLOR_OPERATORS.contains(operator)) { notAppliedGsParams.peek().strokeColor = getGraphicsState().getStrokeColor(); } else if (FILL_COLOR_OPERATORS.contains(operator)) { notAppliedGsParams.peek().fillColor = getGraphicsState().getFillColor(); } else if ("sh".equals(operator)) { AbstractPdfShading shading = getResources().getShading((PdfName) operands.get(0)); getCanvas().paintShading(shading); } else if (!IGNORED_OPERATORS.contains(operator)) { writeOperands(getCanvas(), operands); } } private void cleanText(String operator, List operands) { List textChunks = null; PdfArray 
cleanedText = null; if ("TJ".equals(operator)) { PdfArray originalTJ = (PdfArray) operands.get(0); if (originalTJ.isEmpty()) { // empty TJ neither shows any text nor affects text positioning // we can safely ignore it return; } int i = 0; // text chunk index in original TJ PdfTextArray newTJ = new PdfTextArray(); for (PdfObject e : originalTJ) { if (e.isString()) { if (null == textChunks) { textChunks = ((PdfCleanUpEventListener) getEventListener()).getEncounteredText(); } PdfArray filteredText = filter.filterText(textChunks.get(i++)).getFilterResult(); newTJ.addAll(filteredText); } else { newTJ.add(e); } } cleanedText = newTJ; } else { // if operator is Tj or ' or " textChunks = ((PdfCleanUpEventListener) getEventListener()).getEncounteredText(); PdfCleanUpFilter.FilterResult filterResult = filter.filterText(textChunks.get(0)); if (filterResult.isModified()) { cleanedText = filterResult.getFilterResult(); } } // if text wasn't modified cleanedText is null if (cleanedText == null || cleanedText.size() != 1 || !cleanedText.get(0).isNumber()) { if (null == textChunks) { textChunks = ((PdfCleanUpEventListener) getEventListener()).getEncounteredText(); } TextRenderInfo text = textChunks.get(0); // all text chunks even in case of TJ have the same graphics state writeNotAppliedGsParamsForText(text); beginTextObjectAndOpenNotWrittenTags(); writeNotAppliedTextStateParams(text); textPositioning.writePositionedText(operator, operands, cleanedText, getCanvas()); } else { // cleaned text is tj array with single number - it means that the whole text chunk was removed CanvasGraphicsState gs = getCanvas().getGraphicsState(); // process new lines if necessary if ("'".equals(operator) || "\"".equals(operator)) { List newLineList = new ArrayList<>(); newLineList.add(new PdfLiteral("T*")); textPositioning.appendPositioningOperator("T*", newLineList); } textPositioning.appendTjArrayWithSingleNumber(cleanedText, gs.getFontSize(), gs.getHorizontalScaling()); } } private void 
beginTextObjectAndOpenNotWrittenTags() { if (!isInText) { int numOfTagsBeforeBT = notWrittenTags.size() - numOfOpenedTagsInsideText; CanvasTag tag; for (int i = 0; i < numOfTagsBeforeBT; ++i) { tag = notWrittenTags.pollLast(); getCanvas().openTag(tag); } getCanvas().beginText(); isInText = true; openNotWrittenTags(); } else { openNotWrittenTags(); } } private void writeNotAppliedTextStateParams(TextRenderInfo text) { PdfCanvas canvas = getCanvas(); CanvasGraphicsState currGs = canvas.getGraphicsState(); if (currGs.getCharSpacing() != text.getCharSpacing()) { canvas.setCharacterSpacing(text.getCharSpacing()); } if (currGs.getWordSpacing() != text.getWordSpacing()) { canvas.setWordSpacing(text.getWordSpacing()); } if (currGs.getHorizontalScaling() != text.getHorizontalScaling()) { canvas.setHorizontalScaling(text.getHorizontalScaling()); } // not writing leading here, it is processed along with positioning operators PdfFont currFont = currGs.getFont(); if (currFont == null || currFont.getPdfObject() != text.getFont().getPdfObject() || currGs.getFontSize() != text.getFontSize()) { canvas.setFontAndSize(text.getFont(), text.getFontSize()); } if (currGs.getTextRenderingMode() != text.getTextRenderMode()) { canvas.setTextRenderingMode(text.getTextRenderMode()); } if (currGs.getTextRise() != text.getRise()) { canvas.setTextRise(text.getRise()); } } private void writeNotAppliedGsParamsForText(TextRenderInfo textRenderInfo) { boolean stroke = false; boolean fill = false; switch (textRenderInfo.getTextRenderMode()) { case PdfCanvasConstants.TextRenderingMode.STROKE: case PdfCanvasConstants.TextRenderingMode.STROKE_CLIP: stroke = true; break; case PdfCanvasConstants.TextRenderingMode.FILL: case PdfCanvasConstants.TextRenderingMode.FILL_CLIP: fill = true; break; case PdfCanvasConstants.TextRenderingMode.FILL_STROKE: case PdfCanvasConstants.TextRenderingMode.FILL_STROKE_CLIP: stroke = true; fill = true; break; } writeNotAppliedGsParams(fill, stroke); } private void 
checkIfImageAndClean(List operands) { PdfStream imageStream = getXObjectStream((PdfName) operands.get(0)); if (PdfName.Image.equals(imageStream.getAsName(PdfName.Subtype))) { ImageRenderInfo encounteredImage = ((PdfCleanUpEventListener) getEventListener()).getEncounteredImage(); FilteredImagesCache.FilteredImageKey key = filter.createFilteredImageKey(encounteredImage.getImage(), encounteredImage.getImageCtm(), document); PdfImageXObject imageToWrite = getFilteredImage(key, encounteredImage.getImageCtm()); if (imageToWrite != null) { float[] ctm = pollNotAppliedCtm(); writeNotAppliedGsParams(false, false); openNotWrittenTags(); getCanvas().addXObjectWithTransformationMatrix(imageToWrite, ctm[0], ctm[1], ctm[2], ctm[3], ctm[4], ctm[5]); } } } private PdfImageXObject getFilteredImage(FilteredImagesCache.FilteredImageKey filteredImageKey, Matrix ctmForMasksFiltering) { PdfImageXObject originalImage = filteredImageKey.getImageXObject(); PdfImageXObject imageToWrite = getFilteredImagesCache().get(filteredImageKey); if (imageToWrite == null) { PdfCleanUpFilter.FilterResult imageFilterResult = filter.filterImage(filteredImageKey); if (imageFilterResult.isModified()) { ImageData filteredImageData = imageFilterResult.getFilterResult(); if (filteredImageData != null) { if (Boolean.TRUE.equals(originalImage.getPdfObject().getAsBool(PdfName.ImageMask))) { if (!PdfCleanUpFilter.imageSupportsDirectCleanup(originalImage)) { Logger logger = LoggerFactory.getLogger(PdfCleanUpProcessor.class); logger.error(CleanUpLogMessageConstant.IMAGE_MASK_CLEAN_UP_NOT_SUPPORTED); } else { filteredImageData.makeMask(); } } imageToWrite = new PdfImageXObject(filteredImageData); getFilteredImagesCache().put(filteredImageKey, imageToWrite); // While having been processed with java libraries, only the number of components mattered. // However now we should put the correct color space dictionary as an image's resource, // because it'd be have been considered by pdf browsers before rendering it. 
// Additional checks required as if an image format has been changed, // then the old colorspace may produce an error with the new image data. if (areColorSpacesDifferent(originalImage, imageToWrite) && CleanUpCsCompareUtil.isOriginalCsCompatible(originalImage, imageToWrite)) { PdfObject originalCS = originalImage.getPdfObject().get(PdfName.ColorSpace); if (originalCS != null) { imageToWrite.put(PdfName.ColorSpace, originalCS); } } if (ctmForMasksFiltering != null && !filteredImageData.isMask()) { filterImageMask(originalImage, PdfName.SMask, ctmForMasksFiltering, imageToWrite); filterImageMask(originalImage, PdfName.Mask, ctmForMasksFiltering, imageToWrite); PdfArray colourKeyMaskingArr = originalImage.getPdfObject().getAsArray(PdfName.Mask); if (colourKeyMaskingArr != null) { // In general we should be careful about images that might have changed their color space // or have been converted to lossy format during filtering. // However we have been copying Mask entry non-conditionally before and also I'm not sure // that cases described above indeed take place. imageToWrite.put(PdfName.Mask, colourKeyMaskingArr); } if (originalImage.getPdfObject().containsKey(PdfName.SMaskInData)) { // This entry will likely lose meaning after image conversion to bitmap and back again, but let's leave as is for now. 
imageToWrite.put(PdfName.SMaskInData, originalImage.getPdfObject().get(PdfName.SMaskInData)); } } } } else { imageToWrite = originalImage; } } return imageToWrite; } private void filterImageMask(PdfImageXObject originalImage, PdfName maskKey, Matrix ctmForMasksFiltering, PdfImageXObject imageToWrite) { PdfStream maskStream = originalImage.getPdfObject().getAsStream(maskKey); if (maskStream == null || ctmForMasksFiltering == null) { return; } PdfImageXObject maskImageXObject = new PdfImageXObject(maskStream); if (!PdfCleanUpFilter.imageSupportsDirectCleanup(maskImageXObject)) { Logger logger = LoggerFactory.getLogger(PdfCleanUpProcessor.class); logger.error(CleanUpLogMessageConstant.IMAGE_MASK_CLEAN_UP_NOT_SUPPORTED); return; } FilteredImagesCache.FilteredImageKey k = filter.createFilteredImageKey(maskImageXObject, ctmForMasksFiltering, document); PdfImageXObject maskToWrite = getFilteredImage(k, null); if (maskToWrite != null) { imageToWrite.getPdfObject().put(maskKey, maskToWrite.getPdfObject()); } } private FilteredImagesCache getFilteredImagesCache() { return filteredImagesCache != null ? 
filteredImagesCache : new FilteredImagesCache(); } private void cleanInlineImage() { ImageRenderInfo encounteredImage = ((PdfCleanUpEventListener) getEventListener()).getEncounteredImage(); PdfCleanUpFilter.FilterResult imageFilterResult = filter.filterImage(encounteredImage); ImageData filteredImage; if (imageFilterResult.isModified()) { filteredImage = imageFilterResult.getFilterResult(); } else { filteredImage = ImageDataFactory.create(encounteredImage.getImage().getImageBytes()); } if (filteredImage != null) { Boolean imageMaskFlag = encounteredImage.getImage().getPdfObject().getAsBool(PdfName.ImageMask); if (imageMaskFlag != null && (boolean) imageMaskFlag) { filteredImage.makeMask(); } float[] ctm = pollNotAppliedCtm(); writeNotAppliedGsParams(false, false); openNotWrittenTags(); getCanvas().addImageWithTransformationMatrix(filteredImage, ctm[0], ctm[1], ctm[2], ctm[3], ctm[4], ctm[5], true); } // TODO // PdfCanvas doesn't have a method that writes inline image using pdf stream, and only have method which // accepts Image as parameter. That's why we can't write image just as it was in original file, we convert it to Image. // IMPORTANT: If writing of pdf stream of not changed inline image will be implemented, don't forget to ensure that // inline image color space is present in new resources if necessary. 
} private void writePath() { PathRenderInfo path = ((PdfCleanUpEventListener) getEventListener()).getEncounteredPath(); boolean stroke = (path.getOperation() & PathRenderInfo.STROKE) == PathRenderInfo.STROKE; boolean fill = (path.getOperation() & PathRenderInfo.FILL) == PathRenderInfo.FILL; boolean clip = path.isPathModifiesClippingPath(); // Here we intentionally draw all three paths separately and not combining them in any way: // First of all, stroke converted to fill paths, therefore it could not be combined with fill (if it is // stroke-fill operation) or clip paths, and also it should be drawn after the fill, because in case it's // stroke-fill operation stroke should be "on top" of the filled area. // Secondly, current clipping path modifying happens AFTER the path painting. So if it is drawn separately, clip // path should be the last one. // So consider the situation when it is stroke-fill operation and also this path is marked as clip path. // And here we have it: fill path is the first, stroke path is the second and clip path is the last. And // stroke path could not be combined with neither fill nor clip paths. // Some improved logic could be applied to distinguish the cases when some paths actually could be drawn as one, // but this is the only generic solution. Path fillPath = null; PdfCanvas canvas = getCanvas(); if (fill) { fillPath = filter.filterFillPath(path, path.getRule()); if (!fillPath.isEmpty()) { writeNotAppliedGsParams(true, false); openNotWrittenTags(); writePath(fillPath); if (path.getRule() == FillingRule.NONZERO_WINDING) { canvas.fill(); } else { // FillingRule.EVEN_ODD canvas.eoFill(); } } } if (stroke) { Path strokePath = filter.filterStrokePath(path); if (!strokePath.isEmpty()) { // we pass stroke here as false, because stroke is transformed into fill. 
we don't need to set stroke color writeNotAppliedGsParams(false, false); openNotWrittenTags(); writeStrokePath(strokePath, path.getStrokeColor()); } } if (clip) { Path clippingPath; if (fill && path.getClippingRule() == path.getRule()) { clippingPath = fillPath; } else { clippingPath = filter.filterFillPath(path, path.getClippingRule()); } if (!clippingPath.isEmpty()) { writeNotAppliedGsParams(false, false); openNotWrittenTags(); writePath(clippingPath); if (path.getClippingRule() == FillingRule.NONZERO_WINDING) { canvas.clip(); } else { // FillingRule.EVEN_ODD canvas.eoClip(); } } else { // If the clipping path from the source document is cleaned (it happens when reduction // area covers the path completely), then you should treat it as an empty set (no points // are included in the path). Then the current clipping path (which is the intersection // between previous clipping path and the new one) is also empty set, which means that // there is no visible content at all. But at the same time as we removed the clipping // path, the invisible content would become visible. So, to emulate the correct result, // we would simply put a degenerate clipping path which consists of a single point at (0, 0). 
writeNotAppliedGsParams(false, false); // we still need to open all q operators canvas.moveTo(0, 0).clip(); } canvas.endPath(); } } private void writePath(Path path) { PdfCanvas canvas = getCanvas(); for (Subpath subpath : path.getSubpaths()) { canvas.moveTo((float) subpath.getStartPoint().getX(), (float) subpath.getStartPoint().getY()); for (IShape segment : subpath.getSegments()) { if (segment instanceof BezierCurve) { List basePoints = segment.getBasePoints(); Point p2 = basePoints.get(1); Point p3 = basePoints.get(2); Point p4 = basePoints.get(3); canvas.curveTo((float) p2.getX(), (float) p2.getY(), (float) p3.getX(), (float) p3.getY(), (float) p4.getX(), (float) p4.getY()); } else { // segment is Line Point destination = segment.getBasePoints().get(1); canvas.lineTo((float) destination.getX(), (float) destination.getY()); } } if (subpath.isClosed()) { canvas.closePath(); } } } private void writeStrokePath(Path strokePath, Color strokeColor) { PdfCanvas canvas = getCanvas(); // As we transformed stroke to fill, we set stroke color for filling here canvas.saveState().setFillColor(strokeColor); writePath(strokePath); canvas.fill().restoreState(); } private void removeOrCloseTag() { if (notWrittenTags.size() > 0) { CanvasTag tag = notWrittenTags.pop(); if (tag.hasMcid() && document.isTagged()) { TagTreePointer pointer = document.getTagStructureContext().removeContentItem(currentPage, tag.getMcid()); if (pointer != null) { while (pointer.getKidsRoles().size() == 0) { pointer.removeTag(); } } } } else { getCanvas().endMarkedContent(); } if (btEncountered) { --numOfOpenedTagsInsideText; } } /** * To add images and formXObjects to canvas we pass ctm. Here we try to find last not applied ctm in order to pass it to * PdfCanvas method later. Returned ctm is written right before the image, that's why we care only for not applied ctms of * the current (the "deepest") q/Q nesting level. * If such ctm wasn't found identity ctm is returned. 
*/
    private float[] pollNotAppliedCtm() {
        List<List<PdfObject>> ctms = notAppliedGsParams.peek().ctms;

        if (ctms.size() == 0) {
            return new float[]{1, 0, 0, 1, 0, 0};
        }

        // The polled ctm is removed from the list of not applied ones.
        List<PdfObject> lastCtm = ctms.remove(ctms.size() - 1);
        float[] ctm = new float[6];
        for (int i = 0; i < ctm.length; ++i) {
            ctm[i] = ((PdfNumber) lastCtm.get(i)).floatValue();
        }
        return ctm;
    }

    /**
     * Writes the graphics state params that have not been applied yet.
     * While more than one nesting level is pending, a level is polled from the tail of
     * {@code notAppliedGsParams}, all of its params are written and a save state operator is emitted.
     * For the single remaining level only the params selected by the flags are written.
     *
     * @param fill   whether the fill color of the remaining level should be written
     * @param stroke whether the stroke color and line style operators of the remaining level should be written
     */
    private void writeNotAppliedGsParams(boolean fill, boolean stroke) {
        if (notAppliedGsParams.size() > 0) {
            while (notAppliedGsParams.size() != 1) {
                NotAppliedGsParams gsParams = notAppliedGsParams.pollLast();
                // We want to apply graphics state params of outer q/Q nesting level on it's level and not on the inner
                // q/Q nesting level. Because of that we write all gs params for the outer q/Q, just in case it will be needed
                // later (if we don't write it now, there will be no possibility to write it in the outer q/Q later).
                applyGsParams(true, true, gsParams);
                getCanvas().saveState();
            }
            applyGsParams(fill, stroke, notAppliedGsParams.peek());
        }
    }

    /**
     * Writes the given not applied params to the canvas and clears everything that has been written.
     * Extended graphics states and ctms are always written; all pending ctms are combined into a single
     * matrix first. Line style operators and the stroke color are written only when {@code stroke} is set,
     * the fill color only when {@code fill} is set; params that are skipped stay pending.
     *
     * @param fill     whether to write (and reset) the pending fill color
     * @param stroke   whether to write (and reset) the pending line style operators and stroke color
     * @param gsParams pending params of a single q/Q nesting level
     */
    private void applyGsParams(boolean fill, boolean stroke, NotAppliedGsParams gsParams) {
        for (PdfDictionary extGState : gsParams.extGStates) {
            getCanvas().setExtGState(extGState);
        }
        gsParams.extGStates.clear();

        if (gsParams.ctms.size() > 0) {
            Matrix m = new Matrix();
            for (List<PdfObject> ctm : gsParams.ctms) {
                m = operandsToMatrix(ctm).multiply(m);
            }
            getCanvas().concatMatrix(m.get(Matrix.I11), m.get(Matrix.I12),
                    m.get(Matrix.I21), m.get(Matrix.I22),
                    m.get(Matrix.I31), m.get(Matrix.I32));
            gsParams.ctms.clear();
        }

        if (stroke) {
            for (List<PdfObject> strokeState : gsParams.lineStyleOperators.values()) {
                writeOperands(getCanvas(), strokeState);
            }
            gsParams.lineStyleOperators.clear();
        }

        if (fill) {
            if (gsParams.fillColor != null) {
                getCanvas().setFillColor(gsParams.fillColor);
            }
            gsParams.fillColor = null;
        }

        if (stroke) {
            if (gsParams.strokeColor != null) {
                getCanvas().setStrokeColor(gsParams.strokeColor);
            }
            gsParams.strokeColor = null;
        }
    }

    /**
     * Tells whether the {@code ColorSpace} entries of two images differ.
     * Entries are equal when they are the same object (including both absent) or {@code equals} each other;
     * two arrays are equal when they have the same size and pairwise {@code equals} elements.
     * Everything else is reported as different.
     *
     * @param originalImage the image before clean up
     * @param clearedImage  the image after clean up
     * @return {@code true} if the color spaces differ, {@code false} otherwise
     */
    static boolean areColorSpacesDifferent(PdfImageXObject originalImage, PdfImageXObject clearedImage) {
        PdfObject originalImageCS = originalImage.getPdfObject().get(PdfName.ColorSpace);
        PdfObject clearedImageCS = clearedImage.getPdfObject().get(PdfName.ColorSpace);

        if (originalImageCS == clearedImageCS) {
            return false;
        } else if (originalImageCS == null || clearedImageCS == null) {
            return true;
        } else if (originalImageCS.equals(clearedImageCS)) {
            return false;
        } else if (originalImageCS.isArray() && clearedImageCS.isArray()) {
            PdfArray originalCSArray = (PdfArray) originalImageCS;
            PdfArray clearedCSArray = (PdfArray) clearedImageCS;
            if (originalCSArray.size() != clearedCSArray.size()) {
                return true;
            }
            for (int i = 0; i < originalCSArray.size(); ++i) {
                PdfObject objectFromOriginal = originalCSArray.get(i);
                PdfObject objectFromCleared = clearedCSArray.get(i);
                if (!objectFromOriginal.equals(objectFromCleared)) {
                    return true;
                }
            }
            return false;
} return true; }

    /**
     * Single instance of this class represents not applied graphics state params of the single q/Q nesting level.
     * For example:
     * <pre>
     * 0 g
     * 1 0 0 1 25 50 cm
     *
     * q
     *
     * 5 w
     * /Gs1 gs
     * 13 g
     *
     * Q
     *
     * 1 0 0 RG
     * </pre>
     * Operators "0 g", "1 0 0 1 25 50 cm" and "1 0 0 RG" belong to the outer q/Q nesting level;
     * Operators "5 w", "/Gs1 gs", "13 g" belong to the inner q/Q nesting level.
     * Operators of every level of the q/Q nesting are stored in different instances of this class.
     */
    static class NotAppliedGsParams {
        /** Extended graphics state dictionaries that have not been written yet. */
        List<PdfDictionary> extGStates = new ArrayList<>();
        /** Not yet written ctm changes; list of operator statements. */
        List<List<PdfObject>> ctms = new ArrayList<>();
        /** Pending fill color, {@code null} when there is nothing to write. */
        Color fillColor;
        /** Pending stroke color, {@code null} when there is nothing to write. */
        Color strokeColor;
        /** Pending line style statements in insertion order; operator and its operands. */
        Map<String, List<PdfObject>> lineStyleOperators = new LinkedHashMap<>();
    }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy