// com.day.cq.wcm.offline.DocxImporter Maven / Gradle / Ivy

/*************************************************************************
 *
 * ADOBE CONFIDENTIAL
 * ___________________
 *
 *  Copyright 2013 Adobe Systems Incorporated
 *  All Rights Reserved.
 *
 * NOTICE:  All information contained herein is, and remains
 * the property of Adobe Systems Incorporated and its suppliers,
 * if any.  The intellectual and technical concepts contained
 * herein are proprietary to Adobe Systems Incorporated and its
 * suppliers and are protected by trade secret or copyright law.
 * Dissemination of this information or reproduction of this material
 * is strictly forbidden unless prior written permission is obtained
 * from Adobe Systems Incorporated.
 **************************************************************************/
package com.day.cq.wcm.offline;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.poi.POIXMLException;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.xwpf.usermodel.Document;
import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
import org.apache.poi.xwpf.usermodel.VerticalAlign;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFPicture;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An implementation of {@link TextDocumentImporter} for "modern" MS Word
 * documents ("docx").
 * <p>
 * The document is parsed once in the constructor; all paragraphs that are
 * neither empty nor page breaks are kept in document order and exposed by
 * index through {@link #getParagraph(int)}.
 *
 * @deprecated since 6.3
 */
@Deprecated
public class DocxImporter implements TextDocumentImporter {

    private static final Logger log = LoggerFactory.getLogger(DocxImporter.class);

    private final XWPFDocument document;
    private final List<XWPFParagraph> paragraphs;

    /**
     * Parses the given stream as a docx document and collects its relevant
     * paragraphs.
     *
     * @param stream the docx content; this class does not close it
     * @throws TextImportException if POI rejects the content with a
     *             {@link POIXMLException} (i.e. it is not a docx file)
     * @throws IOException if reading the stream fails
     */
    public DocxImporter(InputStream stream) throws TextImportException, IOException {
        try {
            this.document = new XWPFDocument(stream);
            List<XWPFParagraph> kept = new ArrayList<XWPFParagraph>();
            for (XWPFParagraph p : this.document.getParagraphs()) {
                // Skip paragraphs that carry no content or only a page break.
                if (!p.isEmpty() && !p.isPageBreak()) {
                    kept.add(p);
                }
            }
            this.paragraphs = Collections.unmodifiableList(kept);
        } catch (POIXMLException ex) {
            throw new TextImportException("not a docx file", ex);
        }
    }

    /**
     * @return trimmed contents of the first paragraph whose text is not blank
     *         after trimming, or <code>null</code> if there is no such
     *         paragraph
     */
    public String getTitle() {
        for (XWPFParagraph p : this.paragraphs) {
            String t = p.getText().trim();
            if (!"".equals(t)) {
                return t;
            }
        }
        return null;
    }

    /**
     * @return number of paragraphs retained by the constructor
     */
    public int getNumberOfParagraphs() {
        return this.paragraphs.size();
    }

    /**
     * @param index zero-based index into the retained paragraphs
     * @return a freshly converted view of the paragraph at that index
     * @throws IndexOutOfBoundsException if the index is out of range
     */
    public Paragraph getParagraph(int index) {
        return new DocxParagraph(this.paragraphs.get(index));
    }

    /**
     * Immutable snapshot of one paragraph: its plain text, its HTML
     * rendering, and the pictures embedded in its runs. All conversion
     * happens in the constructor.
     */
    private static final class DocxParagraph implements Paragraph {

        private final String text;
        private final String textHTML;
        private final List<Picture> pictures;

        /**
         * Converts the given POI paragraph.
         *
         * @param p the paragraph to convert
         */
        public DocxParagraph(XWPFParagraph p) {
            // A Word style either maps to a dedicated HTML element (which then
            // replaces the default <p> container and suppresses the class
            // attribute) or is emitted as a CSS class on <p>.
            String classname = WordStyleSupport.makeClassName(p.getStyle());
            String container = "p";
            String elemname = WordStyleSupport.toHtmlElement(classname);
            if (elemname != null) {
                container = elemname;
                classname = null;
            }

            StringBuilder sbtext = new StringBuilder();
            StringBuilder sbhtml = new StringBuilder();
            List<Picture> pics = new ArrayList<Picture>();

            for (XWPFRun run : p.getRuns()) {
                appendRun(run, sbtext, sbhtml);
                for (XWPFPicture pic : run.getEmbeddedPictures()) {
                    pics.add(new DocxPicture(pic));
                }
            }

            String result = sbhtml.toString().trim();

            // Only wrap in a container when there is something to wrap; an
            // empty paragraph yields an empty HTML string.
            if (result.length() > 0) {
                StringBuilder tmp = new StringBuilder();
                tmp.append("<");
                tmp.append(container);
                if (classname != null) {
                    tmp.append(" class='").append(HtmlUtil.escapeHtmlAttr(classname)).append("'");
                }
                tmp.append(">");
                tmp.append(result);
                tmp.append("</").append(container).append(">");
                result = tmp.toString();
            }

            this.text = sbtext.toString().trim();
            this.textHTML = result;
            this.pictures = Collections.unmodifiableList(pics);
        }

        /**
         * Appends one run's text to the plain-text buffer and its escaped
         * text, wrapped in inline formatting elements, to the HTML buffer.
         * Elements are closed in the reverse order they were opened so the
         * markup stays properly nested.
         */
        private static void appendRun(XWPFRun run, StringBuilder sbtext, StringBuilder sbhtml) {
            boolean bold = run.isBold();
            boolean italic = run.isItalic();
            boolean strike = run.isStrike();
            boolean underline = run.getUnderline() != UnderlinePatterns.NONE;
            VerticalAlign valign = run.getSubscript();
            boolean sub = valign == VerticalAlign.SUBSCRIPT;
            boolean sup = valign == VerticalAlign.SUPERSCRIPT;

            if (bold) {
                sbhtml.append("<b>");
            }
            if (italic) {
                sbhtml.append("<i>");
            }
            if (strike) {
                sbhtml.append("<s>");
            }
            if (underline) {
                sbhtml.append("<u>");
            }
            if (sub) {
                sbhtml.append("<sub>");
            }
            if (sup) {
                sbhtml.append("<sup>");
            }

            // Only the text at position 0 of the run is read.
            String textdata = run.getText(0);
            if (textdata != null) {
                sbtext.append(textdata);
                sbhtml.append(HtmlUtil.escapeHtmlText(textdata));
            }

            if (sup) {
                sbhtml.append("</sup>");
            }
            if (sub) {
                sbhtml.append("</sub>");
            }
            if (underline) {
                sbhtml.append("</u>");
            }
            if (strike) {
                sbhtml.append("</s>");
            }
            if (italic) {
                sbhtml.append("</i>");
            }
            if (bold) {
                sbhtml.append("</b>");
            }
        }

        /**
         * @return trimmed plain text of the paragraph
         */
        public String getText() {
            return text;
        }

        /**
         * @return HTML rendering of the paragraph, or an empty string if the
         *         paragraph produced no markup
         */
        public String getHTML() {
            return textHTML;
        }

        /**
         * @return unmodifiable list of pictures embedded in the paragraph
         */
        public List<Picture> getPictures() {
            return pictures;
        }
    }

    /**
     * Adapter exposing a POI picture through the {@link Picture} interface.
     */
    private static final class DocxPicture implements Picture {

        private final XWPFPicture picture;

        public DocxPicture(XWPFPicture picture) {
            this.picture = picture;
        }

        /**
         * @return the media type for GIF, JPEG and PNG pictures;
         *         <code>null</code> (after logging an error) for any other
         *         picture type
         */
        public String getMediaType() {
            int type = this.picture.getPictureData().getPictureType();
            switch (type) {
            case Document.PICTURE_TYPE_GIF:
                return "image/gif";
            case Document.PICTURE_TYPE_JPEG:
                return "image/jpeg";
            case Document.PICTURE_TYPE_PNG:
                return "image/png";
            default:
                log.error("Unknown picture type {} - need to define media type mapping", type);
                return null;
            }
        }

        /**
         * @return the raw picture bytes as stored in the document
         */
        public byte[] getBytes() {
            return this.picture.getPictureData().getData();
        }

        @Override
        public String toString() {
            return getMediaType() + " (" + getBytes().length + " bytes)";
        }
    }
}