All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.rtf.TextExtractor Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.parser.rtf;

import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.util.Calendar;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Map;
import java.util.Stack;
import java.util.TimeZone;

import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.CharsetUtils;
import org.xml.sax.SAXException;

/* Tokenizes and performs a "shallow" parse of the RTF
 * document, just enough to properly decode the text.
 *
 * TODO: we should cutover to a "real" tokenizer (eg JFlex);
 * it should give better perf, by replacing the excessive
 * "else if" string compares with FSA traversal. */

final class TextExtractor {

    // Literal space character and the XHTML element names used for
    // paragraph and list markup.
    private static final char SPACE = ' ';
    private static final String P = "p";
    private static final String LI = "li";
    private static final String OL = "ol";
    private static final String UL = "ul";

    // ASCII must stay the first charset constant: getCharset(...) returns
    // it as the fallback when a charset name cannot be resolved, and static
    // initializers run in textual order, so declaring it later would make
    // the fallback null for every constant initialized before it.
    private static final Charset ASCII = Charset.forName("US-ASCII");
    // All constants below are resolved through getCharset(...), so an
    // unsupported name silently becomes ASCII instead of failing class load.
    // NOTE(review): several code pages appear twice under different constant
    // names (e.g. CP12582 -> "CP1258" and CP1258 -> "cp1258"); presumably both
    // resolve to the same Charset -- confirm before consolidating.
    private static final Charset WINDOWS_1252 = getCharset("WINDOWS-1252");
    private static final Charset MAC_ROMAN = getCharset("MacRoman");
    private static final Charset SHIFT_JIS = getCharset("Shift_JIS");
    private static final Charset WINDOWS_57011 = getCharset("windows-57011");
    private static final Charset WINDOWS_57010 = getCharset("windows-57010");
    private static final Charset WINDOWS_57009 = getCharset("windows-57009");
    private static final Charset WINDOWS_57008 = getCharset("windows-57008");
    private static final Charset WINDOWS_57007 = getCharset("windows-57007");
    private static final Charset WINDOWS_57006 = getCharset("windows-57006");
    private static final Charset WINDOWS_57005 = getCharset("windows-57005");
    private static final Charset WINDOWS_57004 = getCharset("windows-57004");
    private static final Charset WINDOWS_57003 = getCharset("windows-57003");
    private static final Charset X_ISCII91 = getCharset("x-ISCII91");
    private static final Charset X_MAC_CENTRAL_EUROPE = getCharset("x-MacCentralEurope");
    private static final Charset MAC_CYRILLIC = getCharset("MacCyrillic");
    private static final Charset X_JOHAB = getCharset("x-Johab");
    private static final Charset CP12582 = getCharset("CP1258");
    private static final Charset CP12572 = getCharset("CP1257");
    private static final Charset CP12562 = getCharset("CP1256");
    private static final Charset CP12552 = getCharset("CP1255");
    private static final Charset CP12542 = getCharset("CP1254");
    private static final Charset CP12532 = getCharset("CP1253");
    private static final Charset CP1252 = getCharset("CP1252");
    private static final Charset CP12512 = getCharset("CP1251");
    private static final Charset CP12502 = getCharset("CP1250");
    private static final Charset CP950 = getCharset("CP950");
    private static final Charset CP949 = getCharset("CP949");
    private static final Charset MS9362 = getCharset("MS936");
    private static final Charset MS8742 = getCharset("MS874");
    private static final Charset CP866 = getCharset("CP866");
    private static final Charset CP865 = getCharset("CP865");
    private static final Charset CP864 = getCharset("CP864");
    private static final Charset CP863 = getCharset("CP863");
    private static final Charset CP862 = getCharset("CP862");
    private static final Charset CP860 = getCharset("CP860");
    private static final Charset CP852 = getCharset("CP852");
    private static final Charset CP8502 = getCharset("CP850");
    private static final Charset CP819 = getCharset("CP819");
    private static final Charset WINDOWS_720 = getCharset("windows-720");
    private static final Charset WINDOWS_711 = getCharset("windows-711");
    private static final Charset WINDOWS_710 = getCharset("windows-710");
    private static final Charset WINDOWS_709 = getCharset("windows-709");
    private static final Charset ISO_8859_6 = getCharset("ISO-8859-6");
    private static final Charset CP4372 = getCharset("CP437");
    private static final Charset CP850 = getCharset("cp850");
    private static final Charset CP437 = getCharset("cp437");
    private static final Charset MS874 = getCharset("ms874");
    private static final Charset CP1257 = getCharset("cp1257");
    private static final Charset CP1256 = getCharset("cp1256");
    private static final Charset CP1255 = getCharset("cp1255");
    private static final Charset CP1258 = getCharset("cp1258");
    private static final Charset CP1254 = getCharset("cp1254");
    private static final Charset CP1253 = getCharset("cp1253");
    private static final Charset MS950 = getCharset("ms950");
    private static final Charset MS936 = getCharset("ms936");
    private static final Charset MS1361 = getCharset("ms1361");
    private static final Charset MS932 = getCharset("MS932");
    private static final Charset CP1251 = getCharset("cp1251");
    private static final Charset CP1250 = getCharset("cp1250");
    private static final Charset MAC_THAI = getCharset("MacThai");
    private static final Charset MAC_TURKISH = getCharset("MacTurkish");
    private static final Charset MAC_GREEK = getCharset("MacGreek");
    private static final Charset MAC_ARABIC = getCharset("MacArabic");
    private static final Charset MAC_HEBREW = getCharset("MacHebrew");
    private static final Charset JOHAB = getCharset("johab");
    private static final Charset BIG5 = getCharset("Big5");
    private static final Charset GB2312 = getCharset("GB2312");
    private static final Charset MS949 = getCharset("ms949");
    // The RTF doc has a "font table" that assigns ords
    // (f0, f1, f2, etc.) to fonts and charsets, using the
    // \fcharsetN control word.  This mapping maps from the
    // N to corresponding Java charset.  Keys are the boxed
    // N values; the map is filled once by a static initializer.
    private static final Map<Integer, Charset> FCHARSET_MAP =
            new HashMap<>();
    // The RTF may specify the \ansicpgN charset in the
    // header; this maps the N to the corresponding Java
    // character set.  Keys are the boxed code page numbers;
    // the map is filled once by a static initializer.
    private static final Map<Integer, Charset> ANSICPG_MAP =
            new HashMap<>();

    static {
        FCHARSET_MAP.put(0, WINDOWS_1252); // ANSI
        // charset 1 is Default
        // charset 2 is Symbol

        FCHARSET_MAP.put(77, MAC_ROMAN); // Mac Roman
        FCHARSET_MAP.put(78, SHIFT_JIS); // Mac Shift Jis
        FCHARSET_MAP.put(79, MS949); // Mac Hangul
        FCHARSET_MAP.put(80, GB2312); // Mac GB2312
        FCHARSET_MAP.put(81, BIG5); // Mac Big5
        FCHARSET_MAP.put(82, JOHAB); // Mac Johab (old)
        FCHARSET_MAP.put(83, MAC_HEBREW); // Mac Hebrew
        FCHARSET_MAP.put(84, MAC_ARABIC); // Mac Arabic
        FCHARSET_MAP.put(85, MAC_GREEK); // Mac Greek
        FCHARSET_MAP.put(86, MAC_TURKISH); // Mac Turkish
        FCHARSET_MAP.put(87, MAC_THAI); // Mac Thai
        FCHARSET_MAP.put(88, CP1250); // Mac East Europe
        FCHARSET_MAP.put(89, CP1251); // Mac Russian

        FCHARSET_MAP.put(128, MS932); // Shift JIS
        FCHARSET_MAP.put(129, MS949); // Hangul
        FCHARSET_MAP.put(130, MS1361); // Johab
        FCHARSET_MAP.put(134, MS936); // GB2312
        FCHARSET_MAP.put(136, MS950); // Big5
        FCHARSET_MAP.put(161, CP1253); // Greek
        FCHARSET_MAP.put(162, CP1254); // Turkish
        FCHARSET_MAP.put(163, CP1258); // Vietnamese
        FCHARSET_MAP.put(177, CP1255); // Hebrew
        FCHARSET_MAP.put(178, CP1256); // Arabic
        // FCHARSET_MAP.put( 179, "" ); // Arabic Traditional
        // FCHARSET_MAP.put( 180, "" ); // Arabic user
        // FCHARSET_MAP.put( 181, "" ); // Hebrew user
        FCHARSET_MAP.put(186, CP1257); // Baltic

        FCHARSET_MAP.put(204, CP1251); // Russian
        FCHARSET_MAP.put(222, MS874); // Thai
        FCHARSET_MAP.put(238, CP1250); // Eastern European
        FCHARSET_MAP.put(254, CP437); // PC 437
        FCHARSET_MAP.put(255, CP850); // OEM
    }

    static {
        // Populates ANSICPG_MAP: RTF \ansicpgN code page number -> Java
        // charset.  Each code page is registered under its own number;
        // 711 and 720 were previously both stored under key 710, which
        // left them unmapped and made 710 resolve to the wrong constant.
        ANSICPG_MAP.put(437, CP4372);   // US IBM
        ANSICPG_MAP.put(708, ISO_8859_6);   // Arabic (ASMO 708)

        ANSICPG_MAP.put(709, WINDOWS_709);  // Arabic (ASMO 449+, BCON V4)
        ANSICPG_MAP.put(710, WINDOWS_710);  // Arabic (transparent Arabic)
        ANSICPG_MAP.put(711, WINDOWS_711);  // Arabic (Nafitha Enhanced)
        ANSICPG_MAP.put(720, WINDOWS_720);  // Arabic (transparent ASMO)
        ANSICPG_MAP.put(819, CP819);  // Windows 3.1 (US & Western Europe)

        ANSICPG_MAP.put(850, CP8502);  // IBM Multilingual
        ANSICPG_MAP.put(852, CP852);  // Eastern European
        ANSICPG_MAP.put(860, CP860);  // Portuguese
        ANSICPG_MAP.put(862, CP862);  // Hebrew
        ANSICPG_MAP.put(863, CP863);  // French Canadian
        ANSICPG_MAP.put(864, CP864);  // Arabic
        ANSICPG_MAP.put(865, CP865);  // Norwegian
        ANSICPG_MAP.put(866, CP866);  // Soviet Union
        ANSICPG_MAP.put(874, MS8742);  // Thai
        ANSICPG_MAP.put(932, MS932);  // Japanese
        ANSICPG_MAP.put(936, MS9362);  // Simplified Chinese
        ANSICPG_MAP.put(949, CP949);  // Korean
        ANSICPG_MAP.put(950, CP950);  // Traditional Chinese
        ANSICPG_MAP.put(1250, CP12502);  // Eastern European
        ANSICPG_MAP.put(1251, CP12512);  // Cyrillic
        ANSICPG_MAP.put(1252, CP1252);  // Western European
        ANSICPG_MAP.put(1253, CP12532);  // Greek
        ANSICPG_MAP.put(1254, CP12542);  // Turkish
        ANSICPG_MAP.put(1255, CP12552);  // Hebrew
        ANSICPG_MAP.put(1256, CP12562);  // Arabic
        ANSICPG_MAP.put(1257, CP12572);  // Baltic
        ANSICPG_MAP.put(1258, CP12582);  // Vietnamese
        ANSICPG_MAP.put(1361, X_JOHAB);  // Johab
        ANSICPG_MAP.put(10000, MAC_ROMAN);  // Mac Roman
        ANSICPG_MAP.put(10001, SHIFT_JIS);  // Mac Japan
        ANSICPG_MAP.put(10004, MAC_ARABIC);  // Mac Arabic
        ANSICPG_MAP.put(10005, MAC_HEBREW);  // Mac Hebrew
        ANSICPG_MAP.put(10006, MAC_GREEK);  // Mac Greek
        ANSICPG_MAP.put(10007, MAC_CYRILLIC);  // Mac Cyrillic
        ANSICPG_MAP.put(10029, X_MAC_CENTRAL_EUROPE);  // MAC Latin2
        ANSICPG_MAP.put(10081, MAC_TURKISH);  // Mac Turkish
        ANSICPG_MAP.put(57002, X_ISCII91);   // Devanagari

        // TODO: in theory these other charsets are simple
        // shifts off of Devanagari, so we could impl that
        // here:
        ANSICPG_MAP.put(57003, WINDOWS_57003);   // Bengali
        ANSICPG_MAP.put(57004, WINDOWS_57004);   // Tamil
        ANSICPG_MAP.put(57005, WINDOWS_57005);   // Telugu
        ANSICPG_MAP.put(57006, WINDOWS_57006);   // Assamese
        ANSICPG_MAP.put(57007, WINDOWS_57007);   // Oriya
        ANSICPG_MAP.put(57008, WINDOWS_57008);   // Kannada
        ANSICPG_MAP.put(57009, WINDOWS_57009);   // Malayalam
        ANSICPG_MAP.put(57010, WINDOWS_57010);   // Gujarati
        ANSICPG_MAP.put(57011, WINDOWS_57011);   // Punjabi
    }

    // Used when we decode bytes -> chars using CharsetDecoder:
    private final char[] outputArray = new char[128];
    // Buffer view over outputArray (same backing storage).
    private final Buffer outputCharBuffer = CharBuffer.wrap(outputArray);
    // Holds the font table from this RTF doc, mapping
    // the font number (from \fN control word) to the
    // corresponding charset:
    // NOTE(review): declared as a raw Map here; presumably
    // Integer -> Charset per the comment above -- confirm.
    private final Map fontToCharset =
            new HashMap();
    // Group stack: when we open a new group, we push
    // the previous group state onto the stack; when we
    // close the group, we restore it
    // NOTE(review): raw LinkedList; presumably holds GroupState -- confirm.
    private final LinkedList groupStates = new LinkedList();
    // Receives chars from addOutputChar while inHeader is true or
    // fieldState == 1, instead of the normal text output path.
    private final StringBuilder pendingBuffer = new StringBuilder();
    // Collaborators supplied through the constructor.
    private final XHTMLContentHandler out;
    private final Metadata metadata;
    private final RTFEmbObjHandler embObjHandler;
    // How many next ansi chars we should skip; this
    // is 0 except when we are still in the "ansi
    // shadow" after seeing a unicode escape, at which
    // point it's set to the last ucN skip we had seen:
    // (package-private, unlike the other fields)
    int ansiSkip = 0;
    // NOTE(review): no use of this counter is visible in this part of the file.
    private int written = 0;
    // Hold pending bytes (encoded in the current charset)
    // for text output:
    private byte[] pendingBytes = new byte[16];
    private int pendingByteCount;
    // Buffer view over pendingBytes; re-wrapped whenever pendingBytes is
    // replaced by a larger array in addOutputByte.
    private Buffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
    // Holds pending chars for text output
    private char[] pendingChars = new char[10];
    private int pendingCharCount;
    // Holds chars for a still-being-tokenized control word
    private byte[] pendingControl = new byte[10];
    private int pendingControlCount;
    // Reused when possible:
    private CharsetDecoder decoder;
    private Charset lastCharset;
    // Document-wide charset; starts as WINDOWS_1252.
    private Charset globalCharset = WINDOWS_1252;
    // -1 presumably means "not set yet" for both font ids -- confirm.
    private int globalDefaultFont = -1;
    private int curFontID = -1;
    // Current group state; in theory this initial
    // GroupState is unused because the RTF doc should
    // immediately open the top group (start with {):
    private GroupState groupState = new GroupState();
    private boolean inHeader = true;
    //0 not yet in font table, 1 in font table, 2 have processed font table
    private int fontTableState = 0;
    //depth at which the font table started
    private int fontTableDepth;
    // Non null if we are processing metadata (title,
    // keywords, etc.) inside the info group:
    private Property nextMetaData;
    // True once a paragraph/list-item start tag has been emitted and not
    // yet closed (set in lazyStartParagraph, cleared in endParagraph).
    private boolean inParagraph;
    // Non-zero if we are processing inside a field destination:
    private int fieldState;
    // Non-zero list index
    private int pendingListEnd;
    // NOTE(review): raw Maps; key/value types are not visible in this chunk.
    private Map listTable = new HashMap();
    private Map listOverrideTable = new HashMap();
    private Map currentListTable;
    private ListDescriptor currentList;
    private int listTableLevel = -1;
    // Exposed through isIgnoringLists() / setIgnoreListMarkup(boolean).
    private boolean ignoreListMarkup;
    // Non-null if we've seen the url for a HYPERLINK but not yet
    // its text:
    private String pendingURL;
    // Used to process the sub-groups inside the upr
    // group:
    private int uprState = -1;
    // Used when extracting CREATION date:
    private int year, month, day, hour, minute;

    //This keeps track of the following elements as they are
    //written to the handler: p, li, ol, ul
    //This tries to prevent malformed tag orders in the RTF
    //e.g. interleaved tags such as <li><p></li></p>

//from generating malformed xml tags. (TIKA-2899) //This may conceal problems with our parser. //TODO: // 1) do we need to add all elements, a, b, i, etc. // 2) are we doing the right thing by ignoring an element // if its match doesn't pop off the stack...or should // we pop all at the first failure. private Stack paragraphStack = new Stack<>(); //this is an arbitrary limit on the size of the stack //to defend against DoS with memory consumption private int maxStackSize = 1000; public TextExtractor(XHTMLContentHandler out, Metadata metadata, RTFEmbObjHandler embObjHandler) { this.metadata = metadata; this.out = out; this.embObjHandler = embObjHandler; } private static Charset getCharset(String name) { try { return CharsetUtils.forName(name); } catch (IllegalArgumentException e) { return ASCII; } } protected static boolean isHexChar(int ch) { return (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F'); } private static boolean isAlpha(int ch) { return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); } private static boolean isDigit(int ch) { return ch >= '0' && ch <= '9'; } protected static int hexValue(int ch) { if (ch >= '0' && ch <= '9') { return ch - '0'; } else if (ch >= 'a' && ch <= 'z') { return 10 + (ch - 'a'); } else { assert ch >= 'A' && ch <= 'Z'; return 10 + (ch - 'A'); } } public boolean isIgnoringLists() { return ignoreListMarkup; } public void setIgnoreListMarkup(boolean ignore) { this.ignoreListMarkup = ignore; } // Push pending bytes or pending chars: private void pushText() throws IOException, SAXException, TikaException { if (pendingByteCount != 0) { assert pendingCharCount == 0; pushBytes(); } else { pushChars(); } } // Buffers the byte (unit in the current charset) for // output: private void addOutputByte(int b) throws IOException, SAXException, TikaException { assert b >= 0 && b < 256 : "byte value out of range: " + b; if (pendingCharCount != 0) { pushChars(); } if (groupState.pictDepth > 0) { 
embObjHandler.writeMetadataChar((char) b); } else { // Save the byte in pending buffer: if (pendingByteCount == pendingBytes.length) { // Gradual but exponential growth: final byte[] newArray = new byte[(int) (pendingBytes.length * 1.25)]; System.arraycopy(pendingBytes, 0, newArray, 0, pendingBytes.length); pendingBytes = newArray; pendingByteBuffer = ByteBuffer.wrap(pendingBytes); } pendingBytes[pendingByteCount++] = (byte) b; } } // Buffers a byte as part of a control word: private void addControl(int b) { assert isAlpha(b); // Save the byte in pending buffer: if (pendingControlCount == pendingControl.length) { // Gradual but exponential growth: final byte[] newArray = new byte[(int) (pendingControl.length * 1.25)]; System.arraycopy(pendingControl, 0, newArray, 0, pendingControl.length); pendingControl = newArray; } pendingControl[pendingControlCount++] = (byte) b; } // Buffers a UTF16 code unit for output private void addOutputChar(char ch) throws IOException, SAXException, TikaException { if (pendingByteCount != 0) { pushBytes(); } if (inHeader || fieldState == 1) { pendingBuffer.append(ch); } else if (groupState.sn == true || groupState.sv == true) { embObjHandler.writeMetadataChar(ch); } else { if (pendingCharCount == pendingChars.length) { // Gradual but exponential growth: final char[] newArray = new char[(int) (pendingChars.length * 1.25)]; System.arraycopy(pendingChars, 0, newArray, 0, pendingChars.length); pendingChars = newArray; } pendingChars[pendingCharCount++] = ch; } } // Shallow parses the entire doc, writing output to // this.out and this.metadata public void extract(InputStream in) throws IOException, SAXException, TikaException { // in = new FilterInputStream(in) { // public int read() throws IOException { // int r = super.read(); // System.out.write(r); // System.out.flush(); // return r; // } // public int read(byte b[], int off, int len) throws IOException { // int r = super.read(b, off, len); // System.out.write(b, off, r); // 
System.out.flush(); // return r; // } // }; extract(new PushbackInputStream(in, 2)); } private void extract(PushbackInputStream in) throws IOException, SAXException, TikaException { out.startDocument(); while (true) { final int b = in.read(); if (b == -1) { break; } else if (b == '\\') { parseControlToken(in); } else if (b == '{') { pushText(); processGroupStart(in); } else if (b == '}') { pushText(); processGroupEnd(); if (groupStates.isEmpty()) { // parsed document closing brace break; } } else if (groupState.objdata == true || groupState.pictDepth == 1) { embObjHandler.writeHexChar(b); } else if (b != '\r' && b != '\n' && (!groupState.ignore || nextMetaData != null || groupState.sn == true || groupState.sv == true)) { // Linefeed and carriage return are not // significant if (ansiSkip != 0) { ansiSkip--; } else { addOutputByte(b); } } } endParagraph(false); //close out whatever tags were left while (paragraphStack.size() > 0) { end(paragraphStack.pop()); } out.endDocument(); } private void parseControlToken(PushbackInputStream in) throws IOException, SAXException, TikaException { int b = in.read(); if (b == '\'') { // escaped hex char parseHexChar(in); } else if (isAlpha(b)) { // control word parseControlWord((char) b, in); } else if (b == '{' || b == '}' || b == '\\' || b == '\r' || b == '\n') { // escaped char addOutputByte(b); } else if (b != -1) { // control symbol, eg \* or \~ processControlSymbol((char) b); } } private void parseHexChar(PushbackInputStream in) throws IOException, SAXException, TikaException { int hex1 = in.read(); if (!isHexChar(hex1)) { // DOC ERROR (malformed hex escape): ignore in.unread(hex1); return; } int hex2 = in.read(); if (!isHexChar(hex2)) { // TODO: log a warning here, somehow? 
// DOC ERROR (malformed hex escape): // ignore in.unread(hex2); return; } if (ansiSkip != 0) { // Skip this ansi char since we are // still in the shadow of a unicode // escape: ansiSkip--; } else { // Unescape: addOutputByte(16 * hexValue(hex1) + hexValue(hex2)); } } private void parseControlWord(int firstChar, PushbackInputStream in) throws IOException, SAXException, TikaException { addControl(firstChar); int b = in.read(); while (isAlpha(b)) { addControl(b); b = in.read(); } boolean hasParam = false; boolean negParam = false; if (b == '-') { negParam = true; hasParam = true; b = in.read(); } int param = 0; while (isDigit(b)) { param *= 10; param += (b - '0'); hasParam = true; b = in.read(); } // space is consumed as part of the // control word, but is not added to the // control word if (b != ' ') { in.unread(b); } if (hasParam) { if (negParam) { param = -param; } processControlWord(param, in); } else { processControlWord(); } pendingControlCount = 0; } private void lazyStartParagraph() throws IOException, SAXException, TikaException { boolean localInParagraph = inParagraph; if (paragraphStack.size() > 0 && paragraphStack.contains(P)) { localInParagraph = true; } if (!localInParagraph) { // Ensure order if (groupState.italic) { end("i"); } if (groupState.bold) { end("b"); } if (pendingListEnd != 0 && groupState.list != pendingListEnd) { endList(pendingListEnd); pendingListEnd = 0; } if (inList() && pendingListEnd != groupState.list) { startList(groupState.list); } if (inList()) { start(LI); pushParagraphTag(LI); } else { start(P); pushParagraphTag(P); } // Ensure order if (groupState.bold) { start("b"); } if (groupState.italic) { start("i"); } inParagraph = true; } } private void pushParagraphTag(String tag) { if (paragraphStack.size() < maxStackSize) { paragraphStack.push(tag); } else { //ignore. Something is very, very wrong... 
} } private void endParagraph(boolean preserveStyles) throws IOException, SAXException, TikaException { pushText(); //maintain consecutive new lines if (!inParagraph) { lazyStartParagraph(); } if (inParagraph || paragraphStack.size() > 0) { if (groupState.italic) { end("i"); groupState.italic = preserveStyles; } if (groupState.bold) { end("b"); groupState.bold = preserveStyles; } boolean badTagAlignment = false; if (inList()) { if (paragraphStack.size() > 0) { String lastP = paragraphStack.pop(); if (lastP.equals(LI)) { end(LI); } else { pushParagraphTag(lastP); badTagAlignment = true; } } else { //there should have been a starting li } } else { if (paragraphStack.size() > 0) { String lastP = paragraphStack.pop(); if (P.equals(lastP)) { end(P); } else { pushParagraphTag(lastP); badTagAlignment = true; } } } //if there was a failure in tag alignment, //dump all tags and start fresh. if (badTagAlignment) { while (paragraphStack.size() > 0) { end(paragraphStack.pop()); } } if (preserveStyles && (groupState.bold || groupState.italic)) { start(P); pushParagraphTag(P); if (groupState.bold) { start("b"); } if (groupState.italic) { start("i"); } inParagraph = true; } else { inParagraph = false; } } // Ensure closing the list at document end if (!preserveStyles && pendingListEnd != 0) { endList(pendingListEnd); pendingListEnd = 0; } } // Push pending UTF16 units to out ContentHandler private void pushChars() throws IOException, SAXException, TikaException { if (pendingCharCount != 0) { lazyStartParagraph(); out.characters(pendingChars, 0, pendingCharCount); pendingCharCount = 0; } } // Decodes the buffered bytes in pendingBytes // into UTF16 code units, and sends the characters // to the out ContentHandler, if we are in the body, // else appends the characters to the pendingBuffer private void pushBytes() throws IOException, SAXException, TikaException { if (pendingByteCount > 0 && (!groupState.ignore || nextMetaData != null)) { final CharsetDecoder decoder = getDecoder(); 
pendingByteBuffer.limit(pendingByteCount); assert pendingByteBuffer.position() == 0; assert outputCharBuffer.position() == 0; while (true) { // We pass true for endOfInput because, when // we are called, we should have seen a // complete sequence of characters for this // charset: final CoderResult result = decoder.decode((ByteBuffer)pendingByteBuffer, (CharBuffer) outputCharBuffer, true); final int pos = outputCharBuffer.position(); if (pos > 0) { if (inHeader || fieldState == 1) { pendingBuffer.append(outputArray, 0, pos); } else { lazyStartParagraph(); out.characters(outputArray, 0, pos); } outputCharBuffer.position(0); } if (result == CoderResult.UNDERFLOW) { break; } } while (true) { final CoderResult result = decoder.flush((CharBuffer) outputCharBuffer); final int pos = outputCharBuffer.position(); if (pos > 0) { if (inHeader || fieldState == 1) { pendingBuffer.append(outputArray, 0, pos); } else { lazyStartParagraph(); out.characters(outputArray, 0, pos); } outputCharBuffer.position(0); } if (result == CoderResult.UNDERFLOW) { break; } } // Reset for next decode decoder.reset(); pendingByteBuffer.position(0); } pendingByteCount = 0; } // NOTE: s must be ascii alpha only private boolean equals(String s) { if (pendingControlCount != s.length()) { return false; } for (int idx = 0; idx < pendingControlCount; idx++) { assert isAlpha(s.charAt(idx)); if (((byte) s.charAt(idx)) != pendingControl[idx]) { return false; } } return true; } private void processControlSymbol(char ch) throws IOException, SAXException, TikaException { switch (ch) { case '~': // Non-breaking space -> unicode NON-BREAKING SPACE addOutputChar('\u00a0'); break; case '*': // Ignorable destination (control words defined after // the 1987 RTF spec). 
These are already handled by // processGroupStart() break; case '-': // Optional hyphen -> unicode SOFT HYPHEN addOutputChar('\u00ad'); break; case '_': // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN addOutputChar('\u2011'); break; default: break; } } private CharsetDecoder getDecoder() throws TikaException { Charset charset = getCharset(); // Common case: charset is same as last time, so // just reuse it: if (lastCharset == null || !charset.equals(lastCharset)) { decoder = charset.newDecoder(); decoder.onMalformedInput(CodingErrorAction.REPLACE); decoder.onUnmappableCharacter(CodingErrorAction.REPLACE); lastCharset = charset; } return decoder; } // Return current charset in-use private Charset getCharset() throws TikaException { // If a specific font (fN) was set, use its charset if (groupState.fontCharset != null) { return groupState.fontCharset; } // Else, if global default font (defN) was set, use that one if (globalDefaultFont != -1 && !inHeader) { Charset cs = fontToCharset.get(globalDefaultFont); if (cs != null) { return cs; } } // Else, use the global charset if (globalCharset == null) { throw new TikaException("unable to determine charset"); } return globalCharset; } // Handle control word that takes a parameter: private void processControlWord(int param, PushbackInputStream in) throws IOException, SAXException, TikaException { // TODO: afN? (associated font number) // TODO: do these alter text output...? 
/* } else if (equals("stshfdbch")) { // font to be used by default in // style sheet for East Asian chars // arg N is font table entry } else if (equals("stshfloch")) { // font to be used by default in // style sheet for ASCII chars // arg N is font table entry } else if (equals("stshfhich")) { // font to be used by default in // style sheet for High Ansi chars // arg N is font table entry } else if (equals("stshfbi")) { // style sheet for Complex Scripts (BIDI) chars // arg N is font table entry */ // TODO: inefficient that we check equals N times; // we'd get better perf w/ real lexer (eg // JFlex), which uses single-pass FSM to do cmp: if (inHeader) { if (equals("ansicpg")) { // ANSI codepage Charset cs = ANSICPG_MAP.get(param); if (cs != null) { globalCharset = cs; } } else if (equals("deff")) { // Default font globalDefaultFont = param; } else if (equals("nofpages")) { metadata.add(Office.PAGE_COUNT, Integer.toString(param)); } else if (equals("nofwords")) { metadata.add(Office.WORD_COUNT, Integer.toString(param)); } else if (equals("nofchars")) { metadata.add(Office.CHARACTER_COUNT, Integer.toString(param)); } else if (equals("yr")) { year = param; } else if (equals("mo")) { month = param; } else if (equals("dy")) { day = param; } else if (equals("hr")) { hour = param; } else if (equals("min")) { minute = param; } if (fontTableState == 1) { // Still inside font table -- record the // mappings of fN to the fcharset: if (groupState.depth < fontTableDepth) { fontTableState = 2; } else { if (equals("f")) { // Start new font definition curFontID = param; } else if (equals("fcharset")) { Charset cs = FCHARSET_MAP.get(param); if (cs != null) { fontToCharset.put(curFontID, cs); } } } } //if you've already seen the font table, //you aren't in another header item (e.g. styles) //and you see an fX, you're out of the header if (fontTableState == 2 && ! 
groupState.ignore && equals("f")) { inHeader = false; } if (currentList != null) { if (equals("listid")) { currentList.id = param; currentListTable.put(currentList.id, currentList); } else if (equals("listtemplateid")) { currentList.templateID = param; } else if (equals("levelnfc") || equals("levelnfcn")) { //check to make sure list information isn't corrupt if (listTableLevel > -1 && listTableLevel < currentList.numberType.length) { currentList.numberType[listTableLevel] = param; } } } } else { // In document if (equals("b")) { // b0 assert param == 0; if (groupState.bold) { pushText(); if (groupState.italic) { end("i"); } end("b"); if (groupState.italic) { start("i"); } groupState.bold = false; } } else if (equals("i")) { // i0 assert param == 0; if (groupState.italic) { pushText(); end("i"); groupState.italic = false; } } else if (equals("f")) { // Change current font Charset fontCharset = fontToCharset.get(param); // Push any buffered text before changing // font: pushText(); if (fontCharset != null) { groupState.fontCharset = fontCharset; } else { // DOC ERROR: font change referenced a // non-table'd font number // TODO: log a warning? Throw an exc? groupState.fontCharset = null; } } else if (equals("ls")) { groupState.list = param; } else if (equals("lslvl")) { groupState.listLevel = param; } } // Process unicode escape. 
This can appear in doc // or in header, since the metadata (info) fields // in the header can be unicode escaped as well: if (equals("u")) { // Unicode escape if (!groupState.ignore || groupState.sv || groupState.sn) { final char utf16CodeUnit = (char) (param & 0xffff); addOutputChar(utf16CodeUnit); } // After seeing a unicode escape we must // skip the next ucSkip ansi chars (the // "unicode shadow") ansiSkip = groupState.ucSkip; } else if (equals("uc")) { // Change unicode shadow length groupState.ucSkip = param; } else if (equals("bin")) { if (param >= 0) { if (groupState.pictDepth == 1) { try { embObjHandler.writeBytes(in, param); } catch (IOException|TikaException e) { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); embObjHandler.reset(); } } else { IOUtils.skipFully(in, param); } } else { // log some warning? } } } private boolean inList() { return !ignoreListMarkup && groupState.list != 0; } /** * Marks the current list as pending to end. This is done to be able to merge list items of * the same list within the same enclosing list tag (ie. either "ul", or * "ol"). */ private void pendingListEnd() { pendingListEnd = groupState.list; groupState.list = 0; } /** * Emits the end tag of a list. Uses {@link #isUnorderedList(int)} to determine the list * type for the given listID. * * @param listID The ID of the list. * @throws IOException * @throws SAXException * @throws TikaException */ private void endList(int listID) throws IOException, SAXException, TikaException { if (!ignoreListMarkup) { String xl = isUnorderedList(listID) ? UL : OL; if (paragraphStack.size() > 0) { String p = paragraphStack.pop(); if (xl.equals(p)) { end(xl); } } else { //stack as empty, the list was never started } } } /** * Emits the start tag of a list. Uses {@link #isUnorderedList(int)} to determine the list * type for the given listID. * * @param listID The ID of the list. 
* @throws IOException * @throws SAXException * @throws TikaException */ private void startList(int listID) throws IOException, SAXException, TikaException { if (!ignoreListMarkup) { String xl = isUnorderedList(listID) ? UL : OL; start(xl); pushParagraphTag(xl); } } private boolean isUnorderedList(int listID) { ListDescriptor list = listTable.get(listID); if (list != null) { return list.isUnordered(groupState.listLevel); } return true; } private void end(String tag) throws IOException, SAXException, TikaException { out.endElement(tag); } private void start(String tag) throws IOException, SAXException, TikaException { out.startElement(tag); } // Handle non-parameter control word: private void processControlWord() throws IOException, SAXException, TikaException { if (inHeader) { if (equals("ansi")) { globalCharset = WINDOWS_1252; } else if (equals("pca")) { globalCharset = CP850; } else if (equals("pc")) { globalCharset = CP437; } else if (equals("mac")) { globalCharset = MAC_ROMAN; } if (equals("colortbl") || equals("stylesheet") || equals("fonttbl")) { groupState.ignore = true; } else if (equals("listtable")) { currentListTable = listTable; } else if (equals("listoverridetable")) { currentListTable = listOverrideTable; } if (uprState == -1) { // TODO: we can also parse \creatim, \revtim, // \printim, \version, etc. 
if (equals("author")) { nextMetaData = TikaCoreProperties.CREATOR; } else if (equals("title")) { nextMetaData = TikaCoreProperties.TITLE; } else if (equals("subject")) { // TODO: Move to OO subject in Tika 2.0 nextMetaData = TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT; } else if (equals("keywords")) { nextMetaData = TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT; } else if (equals("category")) { nextMetaData = OfficeOpenXMLCore.CATEGORY; } else if (equals("comment")) { nextMetaData = TikaCoreProperties.COMMENTS; } else if (equals("company")) { nextMetaData = OfficeOpenXMLExtended.COMPANY; } else if (equals("manager")) { nextMetaData = OfficeOpenXMLExtended.MANAGER; } else if (equals("template")) { nextMetaData = OfficeOpenXMLExtended.TEMPLATE; } else if (equals("creatim")) { nextMetaData = TikaCoreProperties.CREATED; } } if (fontTableState == 0) { // Didn't see font table yet if (equals("fonttbl")) { fontTableState = 1; fontTableDepth = groupState.depth; } } else if (fontTableState == 1) { // Inside font table if (groupState.depth < fontTableDepth) { fontTableState = 2; } } // List table handling if (currentListTable != null) { if (equals("list") || equals("listoverride")) { currentList = new ListDescriptor(); listTableLevel = -1; } else if (currentList != null) { if (equals("liststylename")) { currentList.isStyle = true; } else if (equals("listlevel")) { listTableLevel++; } } } if (!groupState.ignore && (equals("par") || equals("pard") || equals("sect") || equals("sectd") || equals("plain") || equals("ltrch") || equals("rtlch") || equals("htmlrtf") || equals("line"))) { inHeader = false; } } else { if (equals("b")) { if (!groupState.bold) { pushText(); lazyStartParagraph(); if (groupState.italic) { // Make sure nesting is always end("i"); } groupState.bold = true; start("b"); if (groupState.italic) { start("i"); } } } else if (equals("i")) { if (!groupState.italic) { pushText(); lazyStartParagraph(); groupState.italic = true; start("i"); } } } final 
boolean ignored = groupState.ignore; if (equals("pard")) { // Reset styles pushText(); if (groupState.italic) { end("i"); groupState.italic = false; } if (groupState.bold) { end("b"); groupState.bold = false; } if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0)) pendingListEnd(); } } else if (equals("plain")) { if (groupState.italic || groupState.bold) { // Reset styles pushText(); if (groupState.italic) { end("i"); groupState.italic = false; } if (groupState.bold) { end("b"); groupState.bold = false; } } } else if (equals("par")) { if (!ignored) { endParagraph(true); if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0)) pendingListEnd(); } } } else if (equals("shptxt")) { pushText(); // Text inside a shape groupState.ignore = false; } else if (equals("chatn")) { addOutputChar(SPACE); pushText(); // Annotation ID groupState.ignore = false; } else if (equals("atnid")) { addOutputChar(SPACE); pushText(); // Annotation ID groupState.ignore = false; } else if (equals("atnauthor")) { addOutputChar(SPACE); pushText(); // Annotation author groupState.ignore = false; } else if (equals("annotation")) { groupState.annotation = true; pushText(); // Annotation groupState.ignore = false; } else if (equals("listtext")) { groupState.ignore = true; } else if (equals("cell")) { // TODO: we should produce a table output here? //addOutputChar(' '); endParagraph(true); } else if (equals("sp")) { groupState.sp = true; } else if (equals("sn")) { embObjHandler.startSN(); groupState.sn = true; } else if (equals("sv")) { embObjHandler.startSV(); groupState.sv = true; } else if (equals("object")) { pushText(); embObjHandler.setInObject(true); groupState.object = true; } else if (equals("objdata")) { groupState.objdata = true; embObjHandler.startObjData(); } else if (equals("pict")) { pushText(); // TODO: create img tag? but can that support // embedded image data? 
groupState.pictDepth = 1; embObjHandler.startPict(); } else if (equals("line")) { if (!ignored) { addOutputChar('\n'); } } else if (equals("column")) { if (!ignored) { addOutputChar(' '); } } else if (equals("page")) { if (!ignored) { addOutputChar('\n'); } } else if (equals("softline")) { if (!ignored) { addOutputChar('\n'); } } else if (equals("softcolumn")) { if (!ignored) { addOutputChar(' '); } } else if (equals("softpage")) { if (!ignored) { addOutputChar('\n'); } } else if (equals("tab")) { if (!ignored) { addOutputChar('\t'); } } else if (equals("upr")) { uprState = 0; } else if (equals("ud") && uprState == 1) { uprState = -1; // 2nd group inside the upr destination, which // contains the unicode encoding of the text, so // we want to keep that: groupState.ignore = false; } else if (equals("bullet")) { if (!ignored) { // unicode BULLET addOutputChar('\u2022'); } } else if (equals("endash")) { if (!ignored) { // unicode EN DASH addOutputChar('\u2013'); } } else if (equals("emdash")) { if (!ignored) { // unicode EM DASH addOutputChar('\u2014'); } } else if (equals("enspace")) { if (!ignored) { // unicode EN SPACE addOutputChar('\u2002'); } } else if (equals("qmspace")) { if (!ignored) { // quarter em space -> unicode FOUR-PER-EM SPACE addOutputChar('\u2005'); } } else if (equals("emspace")) { if (!ignored) { // unicode EM SPACE addOutputChar('\u2003'); } } else if (equals("lquote")) { if (!ignored) { // unicode LEFT SINGLE QUOTATION MARK addOutputChar('\u2018'); } } else if (equals("rquote")) { if (!ignored) { // unicode RIGHT SINGLE QUOTATION MARK addOutputChar('\u2019'); } } else if (equals("ldblquote")) { if (!ignored) { // unicode LEFT DOUBLE QUOTATION MARK addOutputChar('\u201C'); } } else if (equals("rdblquote")) { if (!ignored) { // unicode RIGHT DOUBLE QUOTATION MARK addOutputChar('\u201D'); } } else if (equals("fldinst")) { fieldState = 1; groupState.ignore = false; } else if (equals("fldrslt") && fieldState == 2) { assert pendingURL != null; 
lazyStartParagraph(); out.startElement("a", "href", pendingURL); pendingURL = null; fieldState = 3; groupState.ignore = false; } } // Push new GroupState private void processGroupStart(PushbackInputStream in) throws IOException { ansiSkip = 0; // Push current groupState onto the stack groupStates.add(groupState); // Make new GroupState groupState = new GroupState(groupState); assert groupStates.size() == groupState.depth : "size=" + groupStates.size() + " depth=" + groupState.depth; if (uprState == 0) { uprState = 1; groupState.ignore = true; } // Check for ignorable groups. Note that // sometimes we un-ignore within this group, eg // when handling upr escape. int b2 = in.read(); if (b2 == '\\') { int b3 = in.read(); if (b3 == '*') { groupState.ignore = true; } in.unread(b3); } in.unread(b2); } // Pop current GroupState private void processGroupEnd() throws IOException, SAXException, TikaException { if (inHeader) { if (nextMetaData != null) { if (nextMetaData == TikaCoreProperties.CREATED) { Calendar cal = Calendar.getInstance(TimeZone.getDefault(), Locale.ROOT); cal.set(year, month - 1, day, hour, minute, 0); metadata.set(nextMetaData, cal.getTime()); } else if (nextMetaData.isMultiValuePermitted()) { metadata.add(nextMetaData, pendingBuffer.toString()); } else { metadata.set(nextMetaData, pendingBuffer.toString()); } nextMetaData = null; } pendingBuffer.setLength(0); } assert groupState.depth > 0; ansiSkip = 0; if (groupState.objdata == true) { try { embObjHandler.handleCompletedObject(); } catch (TikaException|IOException e) { EmbeddedDocumentUtil.recordException(e, metadata); } groupState.objdata = false; } else if (groupState.pictDepth > 0) { if (groupState.sn == true) { embObjHandler.endSN(); } else if (groupState.sv == true) { embObjHandler.endSV(); } else if (groupState.sp == true) { embObjHandler.endSP(); } else if (groupState.pictDepth == 1) { embObjHandler.handleCompletedObject(); } } if (groupState.annotation == true) { addOutputChar(SPACE); } if 
(groupState.object == true) { embObjHandler.setInObject(false); } // Be robust if RTF doc is corrupt (has too many // closing }s): // TODO: log a warning? if (groupStates.size() > 0) { // Restore group state: final GroupState outerGroupState = groupStates.removeLast(); // Close italic, if outer does not have italic or // bold changed: if (groupState.italic) { if (!outerGroupState.italic || groupState.bold != outerGroupState.bold) { end("i"); groupState.italic = false; } } // Close bold if (groupState.bold && !outerGroupState.bold) { end("b"); } // Open bold if (!groupState.bold && outerGroupState.bold) { start("b"); } // Open italic if (!groupState.italic && outerGroupState.italic) { start("i"); } groupState = outerGroupState; } assert groupStates.size() == groupState.depth; if (fieldState == 1) { String s = pendingBuffer.toString().trim(); pendingBuffer.setLength(0); if (s.startsWith("HYPERLINK")) { s = s.substring(9).trim(); // TODO: what other instructions can be in a // HYPERLINK destination? final boolean isLocalLink = s.contains("\\l "); int idx = s.indexOf('"'); if (idx != -1) { int idx2 = s.indexOf('"', 1 + idx); if (idx2 != -1) { s = s.substring(1 + idx, idx2); } } pendingURL = (isLocalLink ? "#" : "") + s; fieldState = 2; } else { fieldState = 0; } // TODO: we could process the other known field // types. Right now, we will extract their text // inlined, but fail to record them in metadata // as a field value. } else if (fieldState == 3) { end("a"); fieldState = 0; } } }