All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.server.resource.TikaResource Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.server.resource;

import static java.nio.charset.StandardCharsets.UTF_8;

import javax.mail.internet.ContentDisposition;
import javax.mail.internet.ParseException;
import javax.ws.rs.Consumes;
import javax.ws.rs.GET;
import javax.ws.rs.POST;
import javax.ws.rs.PUT;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
import javax.ws.rs.WebApplicationException;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.HttpHeaders;
import javax.ws.rs.core.MultivaluedMap;
import javax.ws.rs.core.Response;
import javax.ws.rs.core.StreamingOutput;
import javax.ws.rs.core.UriInfo;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.lang.reflect.Field;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.cxf.jaxrs.ext.multipart.Attachment;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.apache.tika.sax.RichTextContentHandler;
import org.apache.tika.server.InputStreamFactory;
import org.apache.tika.server.TikaServerParseException;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

@Path("/tika")
public class TikaResource {
    public static final String GREETING = "This is Tika Server ("+new Tika().toString()+"). Please PUT\n";
    public static final String X_TIKA_OCR_HEADER_PREFIX = "X-Tika-OCR";
    public static final String X_TIKA_PDF_HEADER_PREFIX = "X-Tika-PDF";


    private static final Log logger = LogFactory.getLog(TikaResource.class);

    private static TikaConfig tikaConfig;
    private static DigestingParser.Digester digester = null;
    private static InputStreamFactory inputStreamFactory = null;

    public static void init(TikaConfig config, DigestingParser.Digester digestr,
                            InputStreamFactory iSF) {
        tikaConfig = config;
        digester = digestr;
        inputStreamFactory = iSF;
    }

    static {
        ExtractorFactory.setAllThreadsPreferEventExtractors(true);
    }

    @SuppressWarnings("serial")
    public static Parser createParser() {
        final Parser parser = new AutoDetectParser(tikaConfig);

        Map parsers = ((AutoDetectParser)parser).getParsers();
        parsers.put(MediaType.APPLICATION_XML, new HtmlParser());
        ((AutoDetectParser)parser).setParsers(parsers);

        ((AutoDetectParser)parser).setFallback(new Parser() {
            public Set getSupportedTypes(ParseContext parseContext) {
                return parser.getSupportedTypes(parseContext);
            }

            public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) {
                throw new WebApplicationException(Response.Status.UNSUPPORTED_MEDIA_TYPE);
            }
        });
        if (digester != null) {
            return new DigestingParser(parser, digester);
        }
        return parser;
    }

    public static TikaConfig getConfig() {
        return tikaConfig;
    }

    public static String detectFilename(MultivaluedMap httpHeaders) {

        String disposition = httpHeaders.getFirst("Content-Disposition");
        if (disposition != null) {
            try {
                ContentDisposition c = new ContentDisposition(disposition);

                // only support "attachment" dispositions
                if ("attachment".equals(c.getDisposition())) {
                    String fn = c.getParameter("filename");
                    if (fn != null) {
                        return fn;
                    }
                }
            } catch (ParseException e) {
                // not a valid content-disposition field
            	e.printStackTrace();
            	logger.warn(String.format(
                        Locale.ROOT,
                        "Parse exception %s determining content disposition",
                        e.getMessage()
                ), e);
            }
        }

        // this really should not be used, since it's not an official field
        return httpHeaders.getFirst("File-Name");
    }

    public static void fillParseContext(ParseContext parseContext, MultivaluedMap httpHeaders,
                                        Parser embeddedParser) {
        TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
        PDFParserConfig pdfParserConfig = new PDFParserConfig();
        for (String key : httpHeaders.keySet()) {
            if (StringUtils.startsWith(key, X_TIKA_OCR_HEADER_PREFIX)) {
                processHeaderConfig(httpHeaders, ocrConfig, key, X_TIKA_OCR_HEADER_PREFIX);
            } else if (StringUtils.startsWith(key, X_TIKA_PDF_HEADER_PREFIX)) {
                processHeaderConfig(httpHeaders, pdfParserConfig, key, X_TIKA_PDF_HEADER_PREFIX);
            }
        }
        parseContext.set(TesseractOCRConfig.class, ocrConfig);
        parseContext.set(PDFParserConfig.class, pdfParserConfig);
        if (embeddedParser != null) {
            parseContext.set(Parser.class, embeddedParser);
        }
    }

    public static InputStream getInputStream(InputStream is, HttpHeaders headers) {
        try {
            return inputStreamFactory.getInputSteam(is, headers);
        } catch (IOException e) {
            throw new TikaServerParseException(e);
        }
    }

    /**
     * Utility method to set a property on a class via reflection.
     *
     * @param httpHeaders the HTTP headers set.
     * @param object      the Object to set the property on.
     * @param key         the key of the HTTP Header.
     * @param prefix      the name of the HTTP Header prefix used to find property.
     * @throws WebApplicationException thrown when field cannot be found.
     */
    private static void processHeaderConfig(MultivaluedMap httpHeaders, Object object, String key, String prefix) {
        try {
            String property = StringUtils.removeStart(key, prefix);
            Field field = object.getClass().getDeclaredField(StringUtils.uncapitalize(property));
            field.setAccessible(true);
            if (field.getType() == String.class) {
                field.set(object, httpHeaders.getFirst(key));
            } else if (field.getType() == int.class) {
                field.setInt(object, Integer.parseInt(httpHeaders.getFirst(key)));
            } else if (field.getType() == double.class) {
                field.setDouble(object, Double.parseDouble(httpHeaders.getFirst(key)));
            } else if (field.getType() == boolean.class) {
                field.setBoolean(object, Boolean.parseBoolean(httpHeaders.getFirst(key)));
            }
        } catch (Throwable ex) {
            throw new WebApplicationException(String.format(Locale.ROOT,
                    "%s is an invalid %s header", key, X_TIKA_OCR_HEADER_PREFIX));
        }
    }

    @SuppressWarnings("serial")
    public static void fillMetadata(Parser parser, Metadata metadata, ParseContext context, MultivaluedMap httpHeaders) {
        String fileName = detectFilename(httpHeaders);
        if (fileName != null) {
            metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName);
        }

        String contentTypeHeader = httpHeaders.getFirst(HttpHeaders.CONTENT_TYPE);
        javax.ws.rs.core.MediaType mediaType = contentTypeHeader == null ? null
                : javax.ws.rs.core.MediaType.valueOf(contentTypeHeader);
        if (mediaType != null && "xml".equals(mediaType.getSubtype())) {
            mediaType = null;
        }

        if (mediaType != null && mediaType.equals(javax.ws.rs.core.MediaType.APPLICATION_OCTET_STREAM_TYPE)) {
            mediaType = null;
        }

        if (mediaType != null) {
            metadata.add(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE, mediaType.toString());

            final Detector detector = getDetector(parser);

            setDetector(parser, new Detector() {
                public MediaType detect(InputStream inputStream, Metadata metadata) throws IOException {
                    String ct = metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
                    //make sure never to return null -- TIKA-1845
                    MediaType type = null;
                    if (ct != null) {
                        //this can return null if ct is not a valid mime type
                        type = MediaType.parse(ct);
                    }
                    if (type != null) {
                        return type;
                    } else {
                        return detector.detect(inputStream, metadata);
                    }
                }
            });
        }

        final String password = httpHeaders.getFirst("Password");
        if (password != null) {
            context.set(PasswordProvider.class, new PasswordProvider() {
                @Override
                public String getPassword(Metadata metadata) {
                    return password;
                }
            });
        }
    }

    public static void setDetector(Parser p, Detector detector) {
        AutoDetectParser adp = getAutoDetectParser(p);
        adp.setDetector(detector);
    }

    public static Detector getDetector(Parser p) {
        AutoDetectParser adp = getAutoDetectParser(p);
        return adp.getDetector();
    }

    private static AutoDetectParser getAutoDetectParser(Parser p) {
        //bit stinky
        if (p instanceof AutoDetectParser) {
            return (AutoDetectParser)p;
        } else if (p instanceof ParserDecorator) {
            Parser wrapped = ((ParserDecorator)p).getWrappedParser();
            if (wrapped instanceof AutoDetectParser) {
                return (AutoDetectParser)wrapped;
            }
            throw new RuntimeException("Couldn't find AutoDetectParser within: "+wrapped.getClass());

        }
        throw new RuntimeException("Couldn't find AutoDetectParser within: "+p.getClass());

    }

    public static void parse(Parser parser, Log logger, String path, InputStream inputStream,
                             ContentHandler handler, Metadata metadata, ParseContext parseContext) throws IOException {
        try (TikaInputStream tikaInputStream = TikaInputStream.get(inputStream)) {
            parser.parse(tikaInputStream, handler, metadata, parseContext);
        } catch (SAXException e) {
            throw new TikaServerParseException(e);
        } catch (EncryptedDocumentException e) {
            logger.warn(String.format(
                    Locale.ROOT,
                    "%s: Encrypted document",
                    path
            ), e);
            throw new TikaServerParseException(e);
        } catch (Exception e) {
            logger.warn(String.format(
                    Locale.ROOT,
                    "%s: Text extraction failed",
                    path
            ), e);
            throw new TikaServerParseException(e);
        }
    }

    public static void logRequest(Log logger, UriInfo info, Metadata metadata) {
        if (metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE) == null) {
            logger.info(String.format(
                    Locale.ROOT,
                    "%s (autodetecting type)",
                    info.getPath()
            ));
        } else {
            logger.info(String.format(
                    Locale.ROOT,
                    "%s (%s)",
                    info.getPath(),
                    metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE)
            ));
        }
    }

    @GET
    @Produces("text/plain")
    public String getMessage() {
        return GREETING;
    }

    @POST
    @Consumes("multipart/form-data")
    @Produces("text/plain")
    @Path("form")
    public StreamingOutput getTextFromMultipart(Attachment att, @Context final UriInfo info) {
        return produceText(att.getObject(InputStream.class), att.getHeaders(), info);
    }

    @PUT
    @Consumes("*/*")
    @Produces("text/plain")
    public StreamingOutput getText(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
        return produceText(getInputStream(is, httpHeaders), httpHeaders.getRequestHeaders(), info);
    }

    public StreamingOutput produceText(final InputStream is, MultivaluedMap httpHeaders, final UriInfo info) {
        final Parser parser = createParser();
        final Metadata metadata = new Metadata();
        final ParseContext context = new ParseContext();

        fillMetadata(parser, metadata, context, httpHeaders);
        fillParseContext(context, httpHeaders, parser);

        logRequest(logger, info, metadata);

        return new StreamingOutput() {
            public void write(OutputStream outputStream) throws IOException, WebApplicationException {
                Writer writer = new OutputStreamWriter(outputStream, UTF_8);

                BodyContentHandler body = new BodyContentHandler(new RichTextContentHandler(writer));

                try (InputStream inputStream = is) {
                    parse(parser, logger, info.getPath(), inputStream, body, metadata, context);
                }
            }
        };
    }

    @POST
    @Consumes("multipart/form-data")
    @Produces("text/html")
    @Path("form")
    public StreamingOutput getHTMLFromMultipart(Attachment att, @Context final UriInfo info) {
        return produceOutput(att.getObject(InputStream.class), att.getHeaders(), info, "html");
    }

    @PUT
    @Consumes("*/*")
    @Produces("text/html")
    public StreamingOutput getHTML(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
        return produceOutput(getInputStream(is, httpHeaders), httpHeaders.getRequestHeaders(), info, "html");
    }

    @POST
    @Consumes("multipart/form-data")
    @Produces("text/xml")
    @Path("form")
    public StreamingOutput getXMLFromMultipart(Attachment att, @Context final UriInfo info) {
        return produceOutput(att.getObject(InputStream.class), att.getHeaders(), info, "xml");
    }

    @PUT
    @Consumes("*/*")
    @Produces("text/xml")
    public StreamingOutput getXML(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
        return produceOutput(getInputStream(is, httpHeaders), httpHeaders.getRequestHeaders(), info, "xml");
    }

    private StreamingOutput produceOutput(final InputStream is, final MultivaluedMap httpHeaders,
                                          final UriInfo info, final String format) {
        final Parser parser = createParser();
        final Metadata metadata = new Metadata();
        final ParseContext context = new ParseContext();

        fillMetadata(parser, metadata, context, httpHeaders);
        fillParseContext(context, httpHeaders, parser);


        logRequest(logger, info, metadata);

        return new StreamingOutput() {
            public void write(OutputStream outputStream)
                    throws IOException, WebApplicationException {
                Writer writer = new OutputStreamWriter(outputStream, UTF_8);
                ContentHandler content;

                try {
                    SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
                    TransformerHandler handler = factory.newTransformerHandler();
                    handler.getTransformer().setOutputProperty(OutputKeys.METHOD, format);
                    handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
                    handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, UTF_8.name());
                    handler.setResult(new StreamResult(writer));
                    content = new ExpandedTitleContentHandler(handler);
                } catch (TransformerConfigurationException e) {
                    throw new WebApplicationException(e);
                }

                parse(parser, logger, info.getPath(), is, content, metadata, context);
            }
        };
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy