// org.apache.tika.parser.wordperfect.QPWTextExtractor (Maven / Gradle / Ivy)
// Source listing from the aem-sdk-api artifact: The Adobe Experience Manager SDK.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.wordperfect;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import com.google.common.base.Strings;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.UnsupportedFormatException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.QuattroPro;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.XHTMLContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
/**
 * Extracts text from a Quattro Pro document according to QPW v9 File Format.
 * This format appears to be compatible with more recent versions too.
 *
 * <p>The native content stream is consumed as a sequence of records. Each
 * record starts with two shorts (record type, then body length) followed by
 * the record body. Record types registered in {@link #EXTRACTORS} are handed
 * to the matching {@link Extractor}; every other record body is skipped.
 *
 * @author Pascal Essiembre
 */
class QPWTextExtractor {

    private static final Logger LOG = LoggerFactory.getLogger(QPWTextExtractor.class);

    /** Name of the OLE entry that must exist in the root directory of the file. */
    private static final String OLE_DOCUMENT_NAME = "NativeContent_MAIN";

    /**
     * Per-record-type extraction strategies. Each constant reads the body of
     * one record from {@code ctx.in} and writes what it finds to
     * {@code ctx.metadata} and/or {@code ctx.xhtml}.
     */
    private enum Extractor {
        /** Skips the whole record body without emitting anything. */
        IGNORE {
            @Override
            public void extract(Context ctx) throws IOException {
                ctx.in.skipWPByte(ctx.bodyLength);
            }
        },
        /** Beginning of file: copies id, version and page count into the metadata. */
        BOF {
            @Override
            public void extract(Context ctx) throws IOException {
                ctx.metadata.set(QuattroPro.ID, ctx.in.readWPString(4));
                ctx.metadata.set(QuattroPro.VERSION, ctx.in.readWPShort());
                ctx.metadata.set(QuattroPro.BUILD, ctx.in.readWPShort());
                ctx.in.readWPShort(); // Last saved bits
                ctx.metadata.set(QuattroPro.LOWEST_VERSION, ctx.in.readWPShort());
                ctx.metadata.set(Office.PAGE_COUNT, ctx.in.readWPShort());
                // Skip the rest of the body; the 14 presumably accounts for the
                // fields read above (4-char id + five shorts).
                ctx.in.skipWPByte(ctx.bodyLength - 14);
            }
        },
        /** User record: two labels stored as creator and modifier, in that order. */
        USER {
            @Override
            public void extract(Context ctx) throws IOException {
                ctx.metadata.set(TikaCoreProperties.CREATOR, getQstrLabel(ctx.in));
                ctx.metadata.set(TikaCoreProperties.MODIFIER, getQstrLabel(ctx.in));
            }
        },
        /** External link: three shorts are discarded, then the label is emitted on its own line. */
        EXT_LINK {
            @Override
            public void extract(Context ctx) throws IOException, SAXException {
                ctx.in.readWPShort(); // index
                ctx.in.readWPShort(); // page first
                ctx.in.readWPShort(); // page last
                ctx.xhtml.characters(getQstrLabel(ctx.in));
                ctx.xhtml.characters(System.lineSeparator());
            }
        },
        /** String table: an entry count, two discarded values, then one label per entry. */
        STRING_TABLE {
            @Override
            public void extract(Context ctx) throws IOException, SAXException {
                long entries = ctx.in.readWPLong();
                ctx.in.readWPLong(); // Total used
                ctx.in.readWPLong(); // Total saved
                // The counter is a long so it cannot wrap around before reaching
                // an entry count larger than Integer.MAX_VALUE.
                for (long i = 0; i < entries; i++) {
                    ctx.xhtml.characters(getQstrLabel(ctx.in));
                    ctx.xhtml.characters(System.lineSeparator());
                }
            }
        },
        /** Beginning of sheet: the header fields are discarded, then the label is emitted. */
        BOS {
            @Override
            public void extract(Context ctx) throws IOException, SAXException {
                ctx.in.readWPShort(); // sheet #
                ctx.in.readWPShort(); // first col index
                ctx.in.readWPShort(); // last col index
                ctx.in.readWPLong(); // first row index
                ctx.in.readWPLong(); // last row index
                ctx.in.readWPShort(); // format
                ctx.in.readWPShort(); // flags
                ctx.xhtml.characters(getQstrLabel(ctx.in));
                ctx.xhtml.characters(System.lineSeparator());
            }
        },
        /** Sheet header or footer: a flag is discarded, then the label is emitted. */
        SHEET_HEADFOOT {
            @Override
            public void extract(Context ctx) throws IOException, SAXException {
                ctx.in.readWPShort(); // flag
                ctx.xhtml.characters(getQstrLabel(ctx.in));
                ctx.xhtml.characters(System.lineSeparator());
            }
        },
        /** Formula string value: column and row are discarded, then the label is emitted. */
        FORMULA_STRING_VALUE {
            @Override
            public void extract(Context ctx) throws IOException, SAXException {
                ctx.in.readWPShort(); // column
                ctx.in.readWPLong(); // row
                ctx.xhtml.characters(getQstrLabel(ctx.in));
            }
        },
        /** Generic label cell: column, row and format index are discarded, then the label is emitted. */
        CGENERICLABEL {
            @Override
            public void extract(Context ctx) throws IOException, SAXException {
                ctx.in.readWPShort(); // column
                ctx.in.readWPLong(); // row
                ctx.in.readWPShort(); // format index
                ctx.xhtml.characters(getQstrLabel(ctx.in));
            }
        },
        /** Cell comment: emits the author name label followed by the comment label. */
        CCOMMENT {
            @Override
            public void extract(Context ctx) throws IOException, SAXException {
                ctx.in.readWPShort(); // column
                ctx.in.readWPLong(); // row
                ctx.in.readWPLong(); // flag
                ctx.xhtml.characters(getQstrLabel(ctx.in)); // author name
                ctx.xhtml.characters(getQstrLabel(ctx.in)); // comment
            }
        },
        /** Development aid: use to print out a chunk (type, length and raw body). */
        DEBUG {
            @Override
            public void extract(Context ctx) throws IOException {
                LOG.error("REC ({}/{}):{}", Integer.toHexString(ctx.type), ctx.bodyLength,
                        ctx.in.readWPString(ctx.bodyLength));
            }
        };

        /**
         * Consumes the body of the current record.
         *
         * @param ctx stream, output handler, metadata and current record header
         * @throws IOException if reading from the stream fails
         * @throws SAXException if writing to the content handler fails
         */
        public abstract void extract(Context ctx)
                throws IOException, SAXException;
    }

    // Holds extractors for each record types we are interested in.
    // All record types not defined here will be skipped.
    // Keyed by record type; the type arguments are required for the lookup
    // in extract(...) to yield an Extractor rather than an Object.
    private static final Map<Integer, Extractor> EXTRACTORS =
            new HashMap<>();
    static {
        //--- Global Records ---
        EXTRACTORS.put(0x0001, Extractor.BOF);   // Beginning of file
        EXTRACTORS.put(0x0005, Extractor.USER);  // User

        //--- Notebook Records ---
        EXTRACTORS.put(0x0403, Extractor.EXT_LINK); // External link
        EXTRACTORS.put(0x0407, Extractor.STRING_TABLE); // String table

        //--- Sheet Records ---
        EXTRACTORS.put(0x0601, Extractor.BOS); // Beginning of sheet
        EXTRACTORS.put(0x0605, Extractor.SHEET_HEADFOOT); // Sheet header
        EXTRACTORS.put(0x0606, Extractor.SHEET_HEADFOOT); // Sheet footer

        //--- Cells ---
        EXTRACTORS.put(0x0c02, Extractor.FORMULA_STRING_VALUE);
        EXTRACTORS.put(0x0c72, Extractor.CGENERICLABEL);
        EXTRACTORS.put(0x0c80, Extractor.CCOMMENT);
    }

    /**
     * State shared with the extractors while walking the records: the input
     * stream, the output handler, the metadata, and the header (type and body
     * length) of the record currently being processed.
     *
     * <p>Declared static because it never uses the enclosing extractor
     * instance.
     */
    static class Context {
        private final WPInputStream in;
        private final XHTMLContentHandler xhtml;
        private final Metadata metadata;
        private int type;
        private int bodyLength;

        /**
         * @param in stream positioned inside the native content
         * @param xhtml handler receiving the extracted text
         * @param metadata metadata receiving the extracted properties
         */
        public Context(WPInputStream in, XHTMLContentHandler xhtml,
                Metadata metadata) {
            this.in = in;
            this.xhtml = xhtml;
            this.metadata = metadata;
        }
    }

    /**
     * Reads the OLE container from {@code input}, walks every record of the
     * {@code NativeContent_MAIN} entry and emits the extracted text inside a
     * single {@code p} element.
     *
     * @param input stream holding the OLE document
     * @param xhtml handler receiving the extracted text
     * @param metadata metadata receiving the extracted properties
     * @throws IOException if reading the document fails
     * @throws SAXException if writing to the content handler fails
     * @throws TikaException declared for callers; an
     *         {@link UnsupportedFormatException} is thrown when the expected
     *         OLE entry is missing
     */
    // The POIFSFileSystem is intentionally left open here (hence the
    // suppression); NOTE(review): presumably so the caller-owned input stream
    // is not closed by this method - confirm.
    @SuppressWarnings("resource")
    public void extract(
            InputStream input, XHTMLContentHandler xhtml, Metadata metadata)
            throws IOException, SAXException, TikaException {

        POIFSFileSystem pfs = new POIFSFileSystem(input);
        DirectoryNode rootNode = pfs.getRoot();
        if (rootNode == null || !rootNode.hasEntry(OLE_DOCUMENT_NAME)) {
            throw new UnsupportedFormatException("Unsupported QuattroPro file format. "
                    + "Looking for OLE entry \"" + OLE_DOCUMENT_NAME
                    + "\". Found: " + (rootNode == null ? "null" : rootNode.getEntryNames()));
        }

        //TODO shall we validate and throw warning/error if the file does not
        //start with a BOF and ends with a EOF?
        xhtml.startElement("p");
        try (WPInputStream in = new WPInputStream(
                pfs.createDocumentInputStream(OLE_DOCUMENT_NAME))) {
            Context ctx = new Context(in, xhtml, metadata);
            while (hasNext(in)) {
                // Record header: type, then body length.
                ctx.type = in.readWPShort();
                ctx.bodyLength = in.readWPShort();
                Extractor extractor = EXTRACTORS.get(ctx.type);
                if (extractor != null) {
                    extractor.extract(ctx);
                } else {
                    // Use DEBUG to find out what we are ignoring
                    // Extractor.DEBUG.extract(ctx);
                    Extractor.IGNORE.extract(ctx);
                }
            }
        }
        xhtml.endElement("p");
    }

    /**
     * Tells whether at least one more byte can be read, without consuming it:
     * the stream is marked, one byte is read, and the stream is reset.
     *
     * @param in stream to probe; it is marked and reset by this call
     * @return {@code true} unless the read reports end of stream
     * @throws IOException if reading or resetting the stream fails
     */
    private static boolean hasNext(InputStream in) throws IOException {
        try {
            in.mark(1);
            return in.read() != -1;
        } finally {
            in.reset();
        }
    }

    /**
     * Reads one string label from the stream: a short character count, one
     * byte (string type, discarded), then {@code count + 1} characters.
     *
     * @param in stream positioned at the start of the label
     * @return the characters read, as a string
     * @throws IOException if reading from the stream fails
     */
    private static String getQstrLabel(WPInputStream in) throws IOException {
        // QSTR
        int count = in.readWPShort();
        in.readWPByte(); // string type
        char[] text = new char[count + 1];
        text[0] = in.readWPChar();

        // QSTRLABEL
        for (int i = 0; i < count; i++) {
            text[i + 1] = in.readWPChar();
        }
        return new String(text);
    }
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy