All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.tika.sax.StandardsExtractingContentHandler Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.sax;

import java.util.Arrays;
import java.util.List;

import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.metadata.Metadata;

/**
 * StandardsExtractingContentHandler is a Content Handler used to extract
 * standard references while parsing.
 * 

* This handler relies on complex regular expressions which can be slow on some types of * input data. */ public class StandardsExtractingContentHandler extends ContentHandlerDecorator { public static final String STANDARD_REFERENCES = "standard_references"; private final Metadata metadata; private final StringBuilder stringBuilder; private int maxBufferLength = 100000; private double threshold = 0; /** * Creates a decorator for the given SAX event handler and Metadata object. * * @param handler SAX event handler to be decorated. * @param metadata {@link Metadata} object. */ public StandardsExtractingContentHandler(ContentHandler handler, Metadata metadata) { super(handler); this.metadata = metadata; this.stringBuilder = new StringBuilder(); } /** * Creates a decorator that by default forwards incoming SAX events to a * dummy content handler that simply ignores all the events. Subclasses * should use the {@link #setContentHandler(ContentHandler)} method to * switch to a more usable underlying content handler. Also creates a dummy * Metadata object to store phone numbers in. */ protected StandardsExtractingContentHandler() { this(new DefaultHandler(), new Metadata()); } /** * Gets the threshold to be used for selecting the standard references found * within the text based on their score. * * @return the threshold to be used for selecting the standard references * found within the text based on their score. */ public double getThreshold() { return threshold; } /** * Sets the score to be used as threshold. * * @param score the score to be used as threshold. */ public void setThreshold(double score) { this.threshold = score; } /** * The characters method is called whenever a Parser wants to pass raw * characters to the ContentHandler. However, standard references are often * split across different calls to characters, depending on the specific * Parser used. Therefore, we simply add all characters to a StringBuilder * and analyze it once the document is finished. 
*/ @Override public void characters(char[] ch, int start, int length) throws SAXException { try { if (maxBufferLength > -1) { int remaining = maxBufferLength - stringBuilder.length(); length = remaining > length ? length : remaining; } String text = new String(Arrays.copyOfRange(ch, start, start + length)); stringBuilder.append(text); super.characters(ch, start, length); } catch (SAXException e) { handleException(e); } } /** * This method is called whenever the Parser is done parsing the file. So, * we check the output for any standard references. */ @Override public void endDocument() throws SAXException { super.endDocument(); List standards = StandardsText.extractStandardReferences(stringBuilder.toString(), threshold); for (StandardReference standardReference : standards) { metadata.add(STANDARD_REFERENCES, standardReference.toString()); } } /** * The number of characters to store in memory for checking for standards. * * If this is unbounded, the complex regular expressions can take a long time * to process some types of data. Only increase this limit with great caution. */ public void setMaxBufferLength(int maxBufferLength) { this.maxBufferLength = maxBufferLength; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy