All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.parser.microsoft.WordExtractor Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFOldDocument;
import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.model.FieldsDocumentPart;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.model.SavedByEntry;
import org.apache.poi.hwpf.model.SavedByTable;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Field;
import org.apache.poi.hwpf.usermodel.HeaderStories;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableRow;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

public class WordExtractor extends AbstractPOIFSExtractor {

    private static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011';
    private static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b';
    // could be improved by using the real delimiter in xchFollow [MS-DOC], v20140721, 2.4.6.3, Part 3, Step 3
    private static final String LIST_DELIMITER = " ";
    private static final Map fixedParagraphStyles = new HashMap();
    private static final TagAndStyle defaultParagraphStyle = new TagAndStyle("p", null);

    static {
        fixedParagraphStyles.put("Default", defaultParagraphStyle);
        fixedParagraphStyles.put("Normal", defaultParagraphStyle);
        fixedParagraphStyles.put("heading", new TagAndStyle("h1", null));
        fixedParagraphStyles.put("Heading", new TagAndStyle("h1", null));
        fixedParagraphStyles.put("Title", new TagAndStyle("h1", "title"));
        fixedParagraphStyles.put("Subtitle", new TagAndStyle("h2", "subtitle"));
        fixedParagraphStyles.put("HTML Preformatted", new TagAndStyle("pre", null));
    }

    private final boolean extractDeletedContent;
    // True if we are currently in the named style tag:
    private boolean curStrikeThrough;
    private boolean curBold;
    private boolean curItalic;

    private final Metadata metadata;

    public WordExtractor(ParseContext context, Metadata metadata) {
        super(context);
        this.metadata = metadata;
        extractDeletedContent = context.get(OfficeParserConfig.class).getIncludeDeletedContent();
    }

    private static int countParagraphs(Range... ranges) {
        int count = 0;
        for (Range r : ranges) {
            if (r != null) {
                count += r.numParagraphs();
            }
        }
        return count;
    }

    /**
     * Given a style name, return what tag should be used, and
     * what style should be applied to it.
     */
    public static TagAndStyle buildParagraphTagAndStyle(String styleName, boolean isTable) {
        TagAndStyle tagAndStyle = fixedParagraphStyles.get(styleName);
        if (tagAndStyle != null) {
            return tagAndStyle;
        }

        if (styleName.equals("Table Contents") && isTable) {
            return defaultParagraphStyle;
        }

        String tag = "p";
        String styleClass = null;

        if (styleName.startsWith("heading") || styleName.startsWith("Heading")) {
            // "Heading 3" or "Heading2" or "heading 4"
            int num = 1;
            try {
                num = Integer.parseInt(
                        styleName.substring(styleName.length() - 1)
                );
            } catch (NumberFormatException e) {
            }
            // Turn it into a H1 - H6 (H7+ isn't valid!)
            tag = "h" + Math.min(num, 6);
        } else {
            styleClass = styleName.replace(' ', '_');
            styleClass = styleClass.substring(0, 1).toLowerCase(Locale.ROOT) +
                    styleClass.substring(1);
        }

        return new TagAndStyle(tag, styleClass);
    }

    protected void parse(
            NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
            throws IOException, SAXException, TikaException {
        parse(filesystem.getRoot(), xhtml);
    }

    protected void parse(
            DirectoryNode root, XHTMLContentHandler xhtml)
            throws IOException, SAXException, TikaException {
        HWPFDocument document;
        try {
            document = new HWPFDocument(root);
        } catch (org.apache.poi.EncryptedDocumentException e) {
                throw new EncryptedDocumentException(e);
        } catch (OldWordFileFormatException e) {
            parseWord6(root, xhtml);
            return;
        }

        extractSavedByMetadata(document);

        org.apache.poi.hwpf.extractor.WordExtractor wordExtractor =
                new org.apache.poi.hwpf.extractor.WordExtractor(document);
        HeaderStories headerFooter = new HeaderStories(document);

        // Grab the list of pictures. As far as we can tell,
        //  the pictures should be in order, and may be directly
        //  placed or referenced from an anchor
        PicturesTable pictureTable = document.getPicturesTable();
        PicturesSource pictures = new PicturesSource(document);

        // Do any headers, if present
        Range[] headers = new Range[]{headerFooter.getFirstHeaderSubrange(),
                headerFooter.getEvenHeaderSubrange(), headerFooter.getOddHeaderSubrange()};
        handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml);

        // Do the main paragraph text
        Range r = document.getRange();
        ListManager listManager = new ListManager(document);
        for (int i = 0; i < r.numParagraphs(); i++) {
            Paragraph p = r.getParagraph(i);
            i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, listManager, xhtml);
        }

        // Do everything else
        for (String paragraph : wordExtractor.getMainTextboxText()) {
            xhtml.element("p", paragraph);
        }

        for (String paragraph : wordExtractor.getFootnoteText()) {
            xhtml.element("p", paragraph);
        }

        for (String paragraph : wordExtractor.getCommentsText()) {
            xhtml.element("p", paragraph);
        }

        for (String paragraph : wordExtractor.getEndnoteText()) {
            xhtml.element("p", paragraph);
        }

        // Do any footers, if present
        Range[] footers = new Range[]{headerFooter.getFirstFooterSubrange(),
                headerFooter.getEvenFooterSubrange(), headerFooter.getOddFooterSubrange()};
        handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml);

        // Handle any pictures that we haven't output yet
        for (Picture p = pictures.nextUnclaimed(); p != null; ) {
            handlePictureCharacterRun(
                    null, p, pictures, xhtml
            );
            p = pictures.nextUnclaimed();
        }

        // Handle any embeded office documents
        try {
            DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
            for (Entry entry : op) {
                if (entry.getName().startsWith("_")
                        && entry instanceof DirectoryEntry) {
                    handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
                }
            }
        } catch (FileNotFoundException e) {
        }
    }

    private void extractSavedByMetadata(HWPFDocument document) {
        SavedByTable savedByTable = document.getSavedByTable();
        if (savedByTable == null) {
            return;
        }
        for (SavedByEntry sbe : savedByTable.getEntries()) {
            metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, sbe.getSaveLocation());
        }
    }

    private void handleHeaderFooter(Range[] ranges, String type, HWPFDocument document,
                                    PicturesSource pictures, PicturesTable pictureTable, XHTMLContentHandler xhtml)
            throws SAXException, IOException, TikaException {
        if (countParagraphs(ranges) > 0) {
            xhtml.startElement("div", "class", type);
            ListManager listManager = new ListManager(document);
            for (Range r : ranges) {
                if (r != null) {
                    for (int i = 0; i < r.numParagraphs(); i++) {
                        Paragraph p = r.getParagraph(i);

                        i += handleParagraph(p, 0, r, document,
                                FieldsDocumentPart.HEADER, pictures, pictureTable, listManager, xhtml);
                    }
                }
            }
            xhtml.endElement("div");
        }
    }

    private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document,
                                FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable, ListManager listManager,
                                XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
        // Note - a poi bug means we can't currently properly recurse
        //  into nested tables, so currently we don't
        if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) {
            Table t = r.getTable(p);
            xhtml.startElement("table");
            xhtml.startElement("tbody");
            for (int rn = 0; rn < t.numRows(); rn++) {
                TableRow row = t.getRow(rn);
                xhtml.startElement("tr");
                for (int cn = 0; cn < row.numCells(); cn++) {
                    TableCell cell = row.getCell(cn);
                    xhtml.startElement("td");

                    for (int pn = 0; pn < cell.numParagraphs(); pn++) {
                        Paragraph cellP = cell.getParagraph(pn);
                        handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, listManager, xhtml);
                    }
                    xhtml.endElement("td");
                }
                xhtml.endElement("tr");
            }
            xhtml.endElement("tbody");
            xhtml.endElement("table");
            return (t.numParagraphs() - 1);
        }

        String text = p.text();
        if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) {
            // Skip empty paragraphs
            return 0;
        }

        TagAndStyle tas;
        String numbering = null;

        if (document.getStyleSheet().numStyles() > p.getStyleIndex()) {
            StyleDescription style =
                    document.getStyleSheet().getStyleDescription(p.getStyleIndex());
            if (style != null && style.getName() != null && style.getName().length() > 0) {
                if (p.isInList()) {
                    numbering = listManager.getFormattedNumber(p);
                }
                tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0));
            } else {
                tas = new TagAndStyle("p", null);
            }
        } else {
            tas = new TagAndStyle("p", null);
        }

        if (tas.getStyleClass() != null) {
            xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
        } else {
            xhtml.startElement(tas.getTag());
        }

        if (numbering != null) {
            xhtml.characters(numbering);
        }

        for (int j = 0; j < p.numCharacterRuns(); j++) {
            CharacterRun cr = p.getCharacterRun(j);

            // FIELD_BEGIN_MARK:
            if (cr.text().getBytes(UTF_8)[0] == 0x13) {
                Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset());
                // 58 is an embedded document
                // 56 is a document link
                if (field != null && (field.getType() == 58 || field.getType() == 56)) {
                    // Embedded Object: add a 
so consumer can see where // in the main text each embedded document // occurred: String id = "_unknown_id"; //this can return null (TIKA-1956) CharacterRun mscr = field.getMarkSeparatorCharacterRun(r); if (mscr != null) { id = "_" + mscr.getPicOffset(); } AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); attributes.addAttribute("", "id", "id", "CDATA", id); xhtml.startElement("div", attributes); xhtml.endElement("div"); } } if (cr.text().equals("\u0013")) { j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml); } else if (cr.text().startsWith("\u0008")) { // Floating Picture(s) for (int pn = 0; pn < cr.text().length(); pn++) { // Assume they're in the order from the unclaimed list... Picture picture = pictures.nextUnclaimed(); // Output handlePictureCharacterRun(cr, picture, pictures, xhtml); } } else if (pictureTable.hasPicture(cr)) { // Inline Picture Picture picture = pictures.getFor(cr); handlePictureCharacterRun(cr, picture, pictures, xhtml); } else { handleCharacterRun(cr, tas.isHeading(), xhtml); } } // Close any still open style tags if (curStrikeThrough) { xhtml.endElement("s"); curStrikeThrough = false; } if (curItalic) { xhtml.endElement("i"); curItalic = false; } if (curBold) { xhtml.endElement("b"); curBold = false; } xhtml.endElement(tas.getTag()); return 0; } private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLContentHandler xhtml) throws SAXException { // Skip trailing newlines if (!isRendered(cr) || cr.text().equals("\r")) return; if (!skipStyling) { if (cr.isBold() != curBold) { // Enforce nesting -- must close s and i tags if (curStrikeThrough) { xhtml.endElement("s"); curStrikeThrough = false; } if (curItalic) { xhtml.endElement("i"); curItalic = false; } if (cr.isBold()) { xhtml.startElement("b"); } else { xhtml.endElement("b"); } curBold = cr.isBold(); } if (cr.isItalic() != curItalic) { // Enforce nesting -- must close s tag if 
(curStrikeThrough) { xhtml.endElement("s"); curStrikeThrough = false; } if (cr.isItalic()) { xhtml.startElement("i"); } else { xhtml.endElement("i"); } curItalic = cr.isItalic(); } if (cr.isStrikeThrough() != curStrikeThrough) { if (cr.isStrikeThrough()) { xhtml.startElement("s"); } else { xhtml.endElement("s"); } curStrikeThrough = cr.isStrikeThrough(); } } // Clean up the text String text = cr.text(); text = text.replace('\r', '\n'); if (text.endsWith("\u0007")) { // Strip the table cell end marker text = text.substring(0, text.length() - 1); } // Copied from POI's org/apache/poi/hwpf/converter/AbstractWordConverter.processCharacters: // Non-breaking hyphens are returned as char 30 text = text.replace((char) 30, UNICODECHAR_NONBREAKING_HYPHEN); // Non-required hyphens to zero-width space text = text.replace((char) 31, UNICODECHAR_ZERO_WIDTH_SPACE); // Control characters as line break text = text.replaceAll("[\u0000-\u001f]", "\n"); xhtml.characters(text); } /** * Can be \13..text..\15 or \13..control..\14..text..\15 . * Nesting is allowed */ private int handleSpecialCharacterRuns(Paragraph p, int index, boolean skipStyling, PicturesSource pictures, XHTMLContentHandler xhtml) throws SAXException, TikaException, IOException { List controls = new ArrayList(); List texts = new ArrayList(); boolean has14 = false; // Split it into before and after the 14 int i; for (i = index + 1; i < p.numCharacterRuns(); i++) { CharacterRun cr = p.getCharacterRun(i); if (cr.text().equals("\u0013")) { // Nested, oh joy... int increment = handleSpecialCharacterRuns(p, i + 1, skipStyling, pictures, xhtml); i += increment; } else if (cr.text().equals("\u0014")) { has14 = true; } else if (cr.text().equals("\u0015")) { if (!has14) { texts = controls; controls = new ArrayList(); } break; } else { if (has14) { texts.add(cr); } else { controls.add(cr); } } } // Do we need to do something special with this? 
if (controls.size() > 0) { String text = controls.get(0).text(); for (int j = 1; j < controls.size(); j++) { text += controls.get(j).text(); } if ((text.startsWith("HYPERLINK") || text.startsWith(" HYPERLINK")) && text.indexOf('"') > -1) { int start = text.indexOf('"') + 1; int end = findHyperlinkEnd(text, start); String url = ""; if (start >= 0 && start < end && end <= text.length()) { url = text.substring(start, end); } xhtml.startElement("a", "href", url); closeStyleElements(skipStyling, xhtml); for (CharacterRun cr : texts) { handleCharacterRun(cr, skipStyling, xhtml); } closeStyleElements(skipStyling, xhtml); xhtml.endElement("a"); } else { // Just output the text ones for (CharacterRun cr : texts) { if (pictures.hasPicture(cr)) { Picture picture = pictures.getFor(cr); handlePictureCharacterRun(cr, picture, pictures, xhtml); } else { handleCharacterRun(cr, skipStyling, xhtml); } } } } else { // We only had text // Output as-is for (CharacterRun cr : texts) { handleCharacterRun(cr, skipStyling, xhtml); } } // Tell them how many to skip over return i - index; } private void closeStyleElements(boolean skipStyling, XHTMLContentHandler xhtml) throws SAXException { if (skipStyling) { return; } if (curStrikeThrough) { xhtml.endElement("s"); curStrikeThrough = false; } if (curItalic) { xhtml.endElement("i"); curItalic = false; } if (curBold) { xhtml.endElement("b"); curBold = false; } } //temporary work around for TIKA-1512 private int findHyperlinkEnd(String text, int start) { int end = text.lastIndexOf('"'); if (end > start) { return end; } end = text.lastIndexOf('\u201D');//smart right double quote if (end > start) { return end; } end = text.lastIndexOf('\r'); if (end > start) { return end; } //if nothing so far, take the full length of the string //If the full string is > 256 characters, it appears //that the url is truncated in the .doc file. 
This //will return the value as it is in the file, which //may be incorrect; but it is the same behavior as opening //the link in MSWord. //This code does not currently check that length is actually >= 256. //we might want to add that? return text.length(); } private void handlePictureCharacterRun(CharacterRun cr, Picture picture, PicturesSource pictures, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException { if (!isRendered(cr) || picture == null) { // Oh dear, we've run out... // Probably caused by multiple \u0008 images referencing // the same real image return; } // Which one is it? String extension = picture.suggestFileExtension(); int pictureNumber = pictures.pictureNumber(picture); // Make up a name for the picture // There isn't one in the file, but we need to be able to reference // the picture from the img tag and the embedded resource String filename = "image" + pictureNumber + (extension.length() > 0 ? "." + extension : ""); // Grab the mime type for the picture String mimeType = picture.getMimeType(); // Output the img tag AttributesImpl attr = new AttributesImpl(); attr.addAttribute("", "src", "src", "CDATA", "embedded:" + filename); attr.addAttribute("", "alt", "alt", "CDATA", filename); xhtml.startElement("img", attr); xhtml.endElement("img"); // Have we already output this one? // (Only expose each individual image once) if (!pictures.hasOutput(picture)) { TikaInputStream stream = TikaInputStream.get(picture.getContent()); handleEmbeddedResource(stream, filename, null, mimeType, xhtml, false); pictures.recordOutput(picture); } } /** * Outputs a section of text if the given text is non-empty. 
* * @param xhtml XHTML content handler * @param section the class of the <div/> section emitted * @param text text to be emitted, if any * @throws SAXException if an error occurs */ private void addTextIfAny( XHTMLContentHandler xhtml, String section, String text) throws SAXException { if (text != null && text.length() > 0) { xhtml.startElement("div", "class", section); xhtml.element("p", text); xhtml.endElement("div"); } } protected void parseWord6( NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { parseWord6(filesystem.getRoot(), xhtml); } protected void parseWord6( DirectoryNode root, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { HWPFOldDocument doc = new HWPFOldDocument(root); Word6Extractor extractor = new Word6Extractor(doc); for (String p : extractor.getParagraphText()) { xhtml.element("p", p); } } /** * Determines if character run should be included in the extraction. * * @param cr character run. * @return true if character run should be included in extraction. 
*/ private boolean isRendered(final CharacterRun cr) { if (cr == null) { return false; } return !cr.isMarkedDeleted() || (cr.isMarkedDeleted() && extractDeletedContent); } public static class TagAndStyle { private String tag; private String styleClass; public TagAndStyle(String tag, String styleClass) { this.tag = tag; this.styleClass = styleClass; } public String getTag() { return tag; } public String getStyleClass() { return styleClass; } public boolean isHeading() { return tag.length() == 2 && tag.startsWith("h"); } } /** * Provides access to the pictures both by offset, iteration * over the un-claimed, and peeking forward */ private static class PicturesSource { private PicturesTable picturesTable; private Set output = new HashSet(); private Map lookup; private List nonU1based; private List all; private int pn = 0; private PicturesSource(HWPFDocument doc) { picturesTable = doc.getPicturesTable(); all = picturesTable.getAllPictures(); // Build the Offset-Picture lookup map lookup = new HashMap(); for (Picture p : all) { lookup.put(p.getStartOffset(), p); } // Work out which Pictures aren't referenced by // a \u0001 in the main text // These are \u0008 escher floating ones, ones // found outside the normal text, and who // knows what else... 
nonU1based = new ArrayList(); nonU1based.addAll(all); Range r = doc.getRange(); for (int i = 0; i < r.numCharacterRuns(); i++) { CharacterRun cr = r.getCharacterRun(i); if (picturesTable.hasPicture(cr)) { Picture p = getFor(cr); int at = nonU1based.indexOf(p); nonU1based.set(at, null); } } } private boolean hasPicture(CharacterRun cr) { return picturesTable.hasPicture(cr); } private void recordOutput(Picture picture) { output.add(picture); } private boolean hasOutput(Picture picture) { return output.contains(picture); } private int pictureNumber(Picture picture) { return all.indexOf(picture) + 1; } private Picture getFor(CharacterRun cr) { return lookup.get(cr.getPicOffset()); } /** * Return the next unclaimed one, used towards * the end */ private Picture nextUnclaimed() { Picture p = null; while (pn < nonU1based.size()) { p = nonU1based.get(pn); pn++; if (p != null) return p; } return null; } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy