// org.apache.tika.parser.strings.Latin1StringsParser Maven / Gradle / Ivy
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.strings;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
* Parser to extract printable Latin1 strings from arbitrary files with pure java
* without running any external process. Useful for binary or unknown files, for
* files without a specific parser and for corrupted ones causing a TikaException
* as a fallback parser. To enable the parsing of unknown files, or of files
* without a specific parser, with AutoDetectParser:
*
* AutoDetectParser parser = new AutoDetectParser();
* parser.setFallback(new Latin1StringsParser());
*
* Currently the parser does a best effort to extract Latin1 strings, used by
* Western European languages, encoded with ISO-8859-1, UTF-8 or UTF-16 charsets
* mixed within the same file.
*
* The implementation is optimized for fast parsing with only one pass.
*/
public class Latin1StringsParser extends AbstractParser {

    private static final long serialVersionUID = 1L;

    /**
     * The set of supported types.
     */
    private static final Set<MediaType> SUPPORTED_TYPES = getTypes();

    /**
     * The valid ISO-8859-1 character map, indexed by the unsigned byte value.
     */
    private static final boolean[] isChar = getCharMap();

    /**
     * The size of the internal buffers.
     */
    private static final int BUF_SIZE = 64 * 1024;

    /**
     * The minimum size of a character sequence to be extracted.
     */
    private int minSize = 4;

    /**
     * The output buffer.
     */
    private byte[] output = new byte[BUF_SIZE];

    /**
     * The input buffer.
     */
    private byte[] input = new byte[BUF_SIZE];

    /**
     * The temporary position into the output buffer: one past the last byte
     * of the candidate string currently being accumulated.
     */
    private int tmpPos = 0;

    /**
     * The current position into the output buffer: one past the last byte
     * already accepted for output.
     */
    private int outPos = 0;

    /**
     * The number of bytes into the input buffer.
     */
    private int inSize = 0;

    /**
     * The position into the input buffer.
     */
    private int inPos = 0;

    /**
     * The output content handler.
     */
    private XHTMLContentHandler xhtml;

    /**
     * Returns the minimum size of a character sequence to be extracted.
     *
     * @return the minimum size of a character sequence
     */
    public int getMinSize() {
        return minSize;
    }

    /**
     * Sets the minimum size of a character sequence to be extracted. The
     * value is applied to every subsequent call of
     * {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)};
     * while parsing it is limited to the range {@code [0, BUF_SIZE - 1]}.
     *
     * @param minSize
     *            the minimum size of a character sequence
     */
    public void setMinSize(int minSize) {
        this.minSize = minSize;
    }

    /**
     * Populates the valid ISO-8859-1 character map. Accepted bytes are the
     * printable ASCII range 0x20-0x7E, the range 0xC0-0xFE, and the
     * whitespace bytes tab (0x09), line feed (0x0A) and carriage return
     * (0x0D).
     *
     * @return the valid ISO-8859-1 character map.
     */
    private static boolean[] getCharMap() {
        boolean[] map = new boolean[256];
        // Iterate over signed byte values; the casts below are signed as well,
        // so 0xC0..0xFE is the contiguous signed range -64..-2.
        for (int c = Byte.MIN_VALUE; c <= Byte.MAX_VALUE; c++) {
            if ((c >= 0x20 && c <= 0x7E)
                    || (c >= (byte) 0xC0 && c <= (byte) 0xFE)
                    || c == 0x0A || c == 0x0D || c == 0x09) {
                map[c & 0xFF] = true;
            }
        }
        return map;
    }

    /**
     * Returns the set of supported types.
     *
     * @return an immutable set holding {@link MediaType#OCTET_STREAM}
     */
    private static Set<MediaType> getTypes() {
        // Immutable, so callers of getSupportedTypes cannot alter shared state.
        return Collections.singleton(MediaType.OCTET_STREAM);
    }

    /**
     * Tests if the byte is a ISO-8859-1 char.
     *
     * @param c
     *            the byte to test.
     *
     * @return if the byte is a char.
     */
    private static final boolean isChar(byte c) {
        return isChar[c & 0xFF];
    }

    /**
     * Flushes the internal output buffer to the content handler. Everything
     * before the current output position is emitted. When the pending
     * candidate string already has at least {@code minSize} bytes, all but
     * its last {@code minSize} bytes are emitted as well, so the remainder
     * still qualifies as a valid string once it is moved to the start of the
     * buffer.
     *
     * @throws UnsupportedEncodingException
     *             if the windows-1252 charset is not available
     * @throws SAXException
     *             if the content handler fails
     */
    private void flushBuffer() throws UnsupportedEncodingException,
            SAXException {
        if (tmpPos - outPos >= minSize) {
            outPos = tmpPos - minSize;
        }
        xhtml.characters(new String(output, 0, outPos, "windows-1252"));
        // Move the not yet emitted tail to the beginning of the buffer.
        int pending = tmpPos - outPos;
        System.arraycopy(output, outPos, output, 0, pending);
        tmpPos = pending;
        outPos = 0;
    }

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext arg0) {
        return SUPPORTED_TYPES;
    }

    /**
     * Extracts the strings using a fresh worker instance, because parsing
     * mutates the instance buffers and positions. The configured minimum
     * string size is handed over to the worker.
     *
     * @see org.apache.tika.parser.Parser#parse(java.io.InputStream,
     *      org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
     *      org.apache.tika.parser.ParseContext)
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context) throws IOException,
            SAXException {
        /*
         * Creates a new instance because the object is not immutable.
         */
        Latin1StringsParser worker = new Latin1StringsParser();
        /*
         * Propagate the configured threshold, otherwise setMinSize() would
         * have no effect. The value is clamped so that a flush of a full
         * output buffer always frees at least one byte.
         */
        worker.minSize = Math.max(0, Math.min(minSize, BUF_SIZE - 1));
        worker.doParse(stream, handler, metadata, context);
    }

    /**
     * Does a best effort to extract Latin1 strings encoded with ISO-8859-1,
     * UTF-8 or UTF-16. Valid chars are saved into the output buffer and the
     * temporary buffer position is incremented. When an invalid char is read,
     * the difference of the temporary and current buffer position is checked.
     * If it is at least the minimum string size, a line feed is appended and
     * the current buffer position is updated to the temp position. If it is
     * not, the temp position is reset to the current position.
     *
     * Reading stops at the end of the stream or, after a full input buffer
     * has been processed, when the current thread is interrupted; the strings
     * accepted so far are still written to the handler.
     *
     * @param stream
     *            the input stream.
     * @param handler
     *            the output content handler
     * @param metadata
     *            the metadata of the file
     * @param context
     *            the parsing context
     * @throws IOException
     *             if an io error occurs
     * @throws SAXException
     *             if a sax error occurs
     */
    private void doParse(InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context) throws IOException,
            SAXException {
        tmpPos = 0;
        outPos = 0;
        xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        int i = 0;
        do {
            /*
             * Fill the input buffer. The loop ends with i == -1 at the end of
             * the stream, or with i == 0 once the buffer is full.
             */
            inSize = 0;
            while ((i = stream.read(input, inSize, BUF_SIZE - inSize)) > 0) {
                inSize += i;
            }
            inPos = 0;
            while (inPos < inSize) {
                byte c = input[inPos++];
                boolean utf8 = false;
                /*
                 * Test for a possible UTF8 encoded char
                 */
                if (c == (byte) 0xC3) {
                    /*
                     * Take the next byte from the buffer or, at the buffer
                     * end, directly from the stream. At the end of the
                     * stream read() returns -1, which becomes 0xFF and is
                     * rejected as an invalid char below.
                     */
                    byte c_ = inPos < inSize ? input[inPos++]
                            : (byte) stream.read();
                    /*
                     * Test if the next byte is in the valid UTF8 range
                     */
                    if (c_ >= (byte) 0x80 && c_ <= (byte) 0xBF) {
                        utf8 = true;
                        // 0xC3 0x80..0xBF maps to the single byte 0xC0..0xFF
                        output[tmpPos++] = (byte) (c_ + 0x40);
                    } else {
                        // Not UTF8: keep the lead byte, then examine c_
                        output[tmpPos++] = c;
                        c = c_;
                    }
                    if (tmpPos == BUF_SIZE) {
                        flushBuffer();
                    }
                    /*
                     * Test for a possible UTF8 encoded char
                     */
                } else if (c == (byte) 0xC2) {
                    byte c_ = inPos < inSize ? input[inPos++]
                            : (byte) stream.read();
                    /*
                     * Test if the next byte is in the valid UTF8 range
                     */
                    if (c_ >= (byte) 0xA0 && c_ <= (byte) 0xBF) {
                        utf8 = true;
                        // 0xC2 0xA0..0xBF maps to the same single byte value
                        output[tmpPos++] = c_;
                    } else {
                        // Not UTF8: keep the lead byte, then examine c_
                        output[tmpPos++] = c;
                        c = c_;
                    }
                    if (tmpPos == BUF_SIZE) {
                        flushBuffer();
                    }
                }
                if (!utf8) {
                    /*
                     * Test if the byte is a valid char.
                     */
                    if (isChar(c)) {
                        output[tmpPos++] = c;
                        if (tmpPos == BUF_SIZE) {
                            flushBuffer();
                        }
                    } else {
                        /*
                         * Test if the byte is an invalid char, marking a
                         * string end. If it is a zero, test 2 positions
                         * before or ahead for a valid char, meaning it marks
                         * the transition between ISO-8859-1 and UTF16
                         * sequences. Any other zero is skipped without
                         * ending the string.
                         */
                        if (c != 0
                                || (inPos >= 3 && isChar(input[inPos - 3]))
                                || (inPos + 1 < inSize
                                        && isChar(input[inPos + 1]))) {
                            if (tmpPos - outPos >= minSize) {
                                // Accept the candidate, ended by a line feed
                                output[tmpPos++] = 0x0A;
                                outPos = tmpPos;
                                if (tmpPos == BUF_SIZE) {
                                    flushBuffer();
                                }
                            } else {
                                // Too short: discard the candidate
                                tmpPos = outPos;
                            }
                        }
                    }
                }
            }
        } while (i != -1 && !Thread.currentThread().isInterrupted());
        /*
         * Accept a last candidate that was not ended by an invalid char.
         */
        if (tmpPos - outPos >= minSize) {
            output[tmpPos++] = 0x0A;
            outPos = tmpPos;
        }
        xhtml.characters(new String(output, 0, outPos, "windows-1252"));
        xhtml.endDocument();
    }
}