All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.sax.BasicContentHandlerFactory Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
package org.apache.tika.sax;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Locale;

import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Basic factory for creating common types of ContentHandlers
 */
public class BasicContentHandlerFactory implements ContentHandlerFactory {

    /**
     * Tries to parse string into handler type.  Returns default if string is null or
     * parse fails.
     * 

* Options: xml, html, text, body, ignore (no content) * * @param handlerTypeName string to parse * @param defaultType type to return if parse fails * @return handler type */ public static HANDLER_TYPE parseHandlerType(String handlerTypeName, HANDLER_TYPE defaultType) { if (handlerTypeName == null) { return defaultType; } String lcHandlerTypeName = handlerTypeName.toLowerCase(Locale.ROOT); switch (lcHandlerTypeName) { case "xml" : return HANDLER_TYPE.XML; case "text" : return HANDLER_TYPE.TEXT; case "txt" : return HANDLER_TYPE.TEXT; case "html" : return HANDLER_TYPE.HTML; case "body" : return HANDLER_TYPE.BODY; case "ignore" : return HANDLER_TYPE.IGNORE; default : return defaultType; } } /** * Common handler types for content. */ public enum HANDLER_TYPE { BODY, IGNORE, //don't store content TEXT, HTML, XML } private final HANDLER_TYPE type; private final int writeLimit; /** * * @param type basic type of handler * @param writeLimit max number of characters to store; if < 0, the handler will store all characters */ public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit) { this.type = type; this.writeLimit = writeLimit; } @Override public ContentHandler getNewContentHandler() { if (type == HANDLER_TYPE.BODY) { return new BodyContentHandler(writeLimit); } else if (type == HANDLER_TYPE.IGNORE) { return new DefaultHandler(); } if (writeLimit > -1) { switch(type) { case TEXT: return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit); case HTML: return new WriteOutContentHandler(new ToHTMLContentHandler(), writeLimit); case XML: return new WriteOutContentHandler(new ToXMLContentHandler(), writeLimit); default: return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit); } } else { switch (type) { case TEXT: return new ToTextContentHandler(); case HTML: return new ToHTMLContentHandler(); case XML: return new ToXMLContentHandler(); default: return new ToTextContentHandler(); } } } @Override public ContentHandler 
getNewContentHandler(OutputStream os, String encoding) throws UnsupportedEncodingException { return getNewContentHandler(os, Charset.forName(encoding)); } @Override public ContentHandler getNewContentHandler(OutputStream os, Charset charset) { if (type == HANDLER_TYPE.IGNORE) { return new DefaultHandler(); } try { if (writeLimit > -1) { switch (type) { case BODY: return new WriteOutContentHandler( new BodyContentHandler( new OutputStreamWriter(os, charset)), writeLimit); case TEXT: return new WriteOutContentHandler(new ToTextContentHandler(os, charset.name()), writeLimit); case HTML: return new WriteOutContentHandler(new ToHTMLContentHandler(os, charset.name()), writeLimit); case XML: return new WriteOutContentHandler(new ToXMLContentHandler(os, charset.name()), writeLimit); default: return new WriteOutContentHandler(new ToTextContentHandler(os, charset.name()), writeLimit); } } else { switch (type) { case BODY: return new BodyContentHandler(new OutputStreamWriter(os, charset)); case TEXT: return new ToTextContentHandler(os, charset.name()); case HTML: return new ToHTMLContentHandler(os, charset.name()); case XML: return new ToXMLContentHandler(os, charset.name()); default: return new ToTextContentHandler(os, charset.name()); } } } catch (UnsupportedEncodingException e) { throw new RuntimeException("couldn't find charset for name: "+charset); } } /** * * @return handler type used by this factory */ public HANDLER_TYPE getType() { return type; } public int getWriteLimit() { return writeLimit; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy