
org.apache.tika.parser.csv.TextAndCSVParser

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.csv;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.config.Field;
import org.apache.tika.detect.AutoDetectReader;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractEncodingDetectorParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Unless the {@link TikaCoreProperties#CONTENT_TYPE_OVERRIDE} is set,
 * this parser tries to assess whether the file is a text file, csv or tsv.
 * If the detector detects regularity in column numbers and/or encapsulated cells,
 * this parser will apply the {@link org.apache.commons.csv.CSVParser};
 * otherwise, it will treat the contents as text.
 * <p>
 * If there is a csv parse exception during detection, the parser sets
 * the {@link Metadata#CONTENT_TYPE} to {@link MediaType#TEXT_PLAIN}
 * and treats the file as {@link MediaType#TEXT_PLAIN}.
 * <p>
 * If there is a csv parse exception during the parse, the parser
 * writes what's left of the stream as if it were text and then throws
 * an exception. As of this writing, the content that was buffered by the underlying
 * {@link org.apache.commons.csv.CSVParser} is lost.
 */
public class TextAndCSVParser extends AbstractEncodingDetectorParser {

    private static final String CSV_PREFIX = "csv";
    private static final String CHARSET = "charset";
    private static final String DELIMITER = "delimiter";

    public static final Property DELIMITER_PROPERTY = Property.externalText(
            CSV_PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + DELIMITER);

    private static final String TD = "td";
    private static final String TR = "tr";
    private static final String TABLE = "table";

    static final MediaType CSV = MediaType.text("csv");
    static final MediaType TSV = MediaType.text("tsv");

    private static final int DEFAULT_MARK_LIMIT = 20000;

    private static final char[] DEFAULT_DELIMITERS = new char[]{',', '\t'};

    private static final Map<Character, String> CHAR_TO_STRING_DELIMITER_MAP = new HashMap<>();
    private static final Map<String, Character> STRING_TO_CHAR_DELIMITER_MAP = new HashMap<>();

    static {
        CHAR_TO_STRING_DELIMITER_MAP.put(',', "comma");
        CHAR_TO_STRING_DELIMITER_MAP.put('\t', "tab");
        CHAR_TO_STRING_DELIMITER_MAP.put('|', "pipe");
        CHAR_TO_STRING_DELIMITER_MAP.put(';', "semicolon");
        CHAR_TO_STRING_DELIMITER_MAP.put(':', "colon");
    }

    static {
        for (Map.Entry<Character, String> e : CHAR_TO_STRING_DELIMITER_MAP.entrySet()) {
            STRING_TO_CHAR_DELIMITER_MAP.put(e.getValue(), e.getKey());
        }
    }

    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
                    CSV, TSV, MediaType.TEXT_PLAIN)));

    private char[] delimiters = DEFAULT_DELIMITERS;

    /**
     * This is the mark limit in characters (not bytes) to
     * read from the stream when classifying the stream as
     * csv, tsv or txt.
     */
    @Field
    private int markLimit = DEFAULT_MARK_LIMIT;

    /**
     * minimum confidence score that there's enough
     * evidence to determine csv/tsv vs. txt
     */
    @Field
    private double minConfidence = 0.50;

    public TextAndCSVParser() {
        super();
    }

    public TextAndCSVParser(EncodingDetector encodingDetector) {
        super(encodingDetector);
    }

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    @Override
    public void parse(InputStream stream, ContentHandler handler,
                      Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {

        CSVParams params = getOverride(metadata);
        Reader reader = null;
        Charset charset = null;
        if (!params.isComplete()) {
            reader = detect(params, stream, metadata, context);
            if (params.getCharset() != null) {
                charset = params.getCharset();
            } else {
                charset = ((AutoDetectReader) reader).getCharset();
            }
        } else {
            reader = new BufferedReader(new InputStreamReader(stream, params.getCharset()));
            charset = params.getCharset();
        }

        updateMetadata(params, metadata);

        //if text or a non-csv/tsv category of text
        //treat this as text and be done
        //TODO -- if it was detected as a non-csv subtype of text
        if (!params.getMediaType().getBaseType().equals(CSV) &&
                !params.getMediaType().getBaseType().equals(TSV)) {
            handleText(reader, charset, handler, metadata);
            return;
        }

        CSVFormat csvFormat = CSVFormat.EXCEL.withDelimiter(params.getDelimiter());
        metadata.set(DELIMITER_PROPERTY,
                CHAR_TO_STRING_DELIMITER_MAP.get(csvFormat.getDelimiter()));

        XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(handler, metadata);
        try (org.apache.commons.csv.CSVParser commonsParser =
                     new org.apache.commons.csv.CSVParser(reader, csvFormat)) {
            xhtmlContentHandler.startDocument();
            xhtmlContentHandler.startElement(TABLE);
            try {
                for (CSVRecord row : commonsParser) {
                    xhtmlContentHandler.startElement(TR);
                    for (String cell : row) {
                        xhtmlContentHandler.startElement(TD);
                        xhtmlContentHandler.characters(cell);
                        xhtmlContentHandler.endElement(TD);
                    }
                    xhtmlContentHandler.endElement(TR);
                }
            } catch (IllegalStateException e) {
                //if there's a parse exception
                //try to get the rest of the content...treat it as text for now
                //There will be some content lost because of buffering.
                //TODO -- figure out how to improve this
                xhtmlContentHandler.endElement(TABLE);
                xhtmlContentHandler.startElement("div", "name", "after exception");
                handleText(reader, xhtmlContentHandler);
                xhtmlContentHandler.endElement("div");
                xhtmlContentHandler.endDocument();
                //TODO -- consider dumping what's left in the reader as text
                throw new TikaException("exception parsing the csv", e);
            }

            xhtmlContentHandler.endElement(TABLE);
            xhtmlContentHandler.endDocument();
        }
    }

    private void handleText(Reader reader, Charset charset,
                            ContentHandler handler, Metadata metadata)
            throws SAXException, IOException, TikaException {
        //try to get detected content type; could be a subclass of text/plain
        //such as vcal, etc.
        String incomingMime = metadata.get(Metadata.CONTENT_TYPE);
        MediaType mediaType = MediaType.TEXT_PLAIN;
        if (incomingMime != null) {
            MediaType tmpMediaType = MediaType.parse(incomingMime);
            if (tmpMediaType != null) {
                mediaType = tmpMediaType;
            }
        }
        MediaType type = new MediaType(mediaType, charset);
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        handleText(reader, xhtml);
        xhtml.endDocument();
    }

    private static void handleText(Reader reader, XHTMLContentHandler xhtml)
            throws SAXException, IOException {
        xhtml.startElement("p");
        char[] buffer = new char[4096];
        int n = reader.read(buffer);
        while (n != -1) {
            xhtml.characters(buffer, 0, n);
            n = reader.read(buffer);
        }
        xhtml.endElement("p");
    }

    private Reader detect(CSVParams params, InputStream stream,
                          Metadata metadata, ParseContext context)
            throws IOException, TikaException {
        //if the file was already identified as not .txt, .csv or .tsv,
        //don't even try to detect whether it is csv or not
        String mediaString = metadata.get(Metadata.CONTENT_TYPE);
        if (mediaString != null) {
            MediaType mediaType = MediaType.parse(mediaString);
            if (!SUPPORTED_TYPES.contains(mediaType.getBaseType())) {
                params.setMediaType(mediaType);
                return new AutoDetectReader(
                        new CloseShieldInputStream(stream),
                        metadata, getEncodingDetector(context));
            }
        }
        Reader reader = null;
        if (params.getCharset() == null) {
            reader = new AutoDetectReader(
                    new CloseShieldInputStream(stream),
                    metadata, getEncodingDetector(context));
            params.setCharset(((AutoDetectReader) reader).getCharset());
            if (params.isComplete()) {
                return reader;
            }
        } else {
            reader = new BufferedReader(new InputStreamReader(
                    new CloseShieldInputStream(stream), params.getCharset()));
        }

        if (params.getDelimiter() == null &&
                (params.getMediaType() == null || isCSVOrTSV(params.getMediaType()))) {
            CSVSniffer sniffer = new CSVSniffer(markLimit, delimiters, minConfidence);
            CSVResult result = sniffer.getBest(reader, metadata);
            params.setMediaType(result.getMediaType());
            params.setDelimiter(result.getDelimiter());
        }
        return reader;
    }

    private CSVParams getOverride(Metadata metadata) {
        String override = metadata.get(TikaCoreProperties.CONTENT_TYPE_OVERRIDE);
        if (override == null) {
            return new CSVParams();
        }
        MediaType mediaType = MediaType.parse(override);
        if (mediaType == null) {
            return new CSVParams();
        }
        String charsetString = mediaType.getParameters().get(CHARSET);
        Charset charset = null;
        if (charsetString != null) {
            try {
                charset = Charset.forName(charsetString);
            } catch (UnsupportedCharsetException e) {
                //swallow and fall back to detection
            }
        }
        if (!isCSVOrTSV(mediaType)) {
            return new CSVParams(mediaType, charset);
        }

        String delimiterString = mediaType.getParameters().get(DELIMITER);
        if (delimiterString == null) {
            return new CSVParams(mediaType, charset);
        }
        if (STRING_TO_CHAR_DELIMITER_MAP.containsKey(delimiterString)) {
            return new CSVParams(mediaType, charset,
                    (char) STRING_TO_CHAR_DELIMITER_MAP.get(delimiterString));
        }
        if (delimiterString.length() == 1) {
            return new CSVParams(mediaType, charset, delimiterString.charAt(0));
        }
        //TODO: log bad/unrecognized delimiter string
        return new CSVParams(mediaType, charset);
    }

    static boolean isCSVOrTSV(MediaType mediaType) {
        if (mediaType == null) {
            return false;
        }
        return mediaType.getBaseType().equals(TSV) ||
                mediaType.getBaseType().equals(CSV);
    }

    private void updateMetadata(CSVParams params, Metadata metadata) {
        MediaType mediaType = null;
        if (params.getMediaType().getBaseType().equals(MediaType.TEXT_PLAIN)) {
            mediaType = MediaType.TEXT_PLAIN;
        } else if (params.getDelimiter() != null) {
            if (params.getDelimiter() == '\t') {
                mediaType = TSV;
            } else {
                mediaType = CSV;
            }
        } else {
            if (metadata.get(Metadata.CONTENT_TYPE) != null) {
                mediaType = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
            }
        }
        Map<String, String> attrs = new HashMap<>();
        if (params.getCharset() != null) {
            attrs.put(CHARSET, params.getCharset().name());
            // deprecated, see TIKA-431
            metadata.set(Metadata.CONTENT_ENCODING, params.getCharset().name());
        }
        if (!mediaType.equals(MediaType.TEXT_PLAIN) && params.getDelimiter() != null) {
            if (CHAR_TO_STRING_DELIMITER_MAP.containsKey(params.getDelimiter())) {
                attrs.put(DELIMITER, CHAR_TO_STRING_DELIMITER_MAP.get(params.getDelimiter()));
            } else {
                attrs.put(DELIMITER, Integer.toString((int) params.getDelimiter().charValue()));
            }
        }
        MediaType type = new MediaType(mediaType, attrs);
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
    }
}
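
A minimal usage sketch, not part of the Tika source above: it wires TextAndCSVParser into a standalone program and prints the sniffed content type and the extracted body. The class name TextAndCSVExample and the file name data.csv are placeholders for illustration.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.csv.TextAndCSVParser;
import org.apache.tika.sax.BodyContentHandler;

public class TextAndCSVExample {
    public static void main(String[] args) throws Exception {
        TextAndCSVParser parser = new TextAndCSVParser();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        // -1 disables BodyContentHandler's default write limit
        BodyContentHandler handler = new BodyContentHandler(-1);
        try (InputStream stream = Files.newInputStream(Paths.get("data.csv"))) { // hypothetical input file
            parser.parse(stream, handler, metadata, context);
        }
        // After a successful parse, CONTENT_TYPE carries the detected type,
        // charset and delimiter, e.g. "text/csv; charset=ISO-8859-1; delimiter=comma"
        System.out.println(metadata.get(Metadata.CONTENT_TYPE));
        System.out.println(handler.toString());
    }
}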
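
And a sketch of the override path described in the class Javadoc: setting TikaCoreProperties.CONTENT_TYPE_OVERRIDE with charset and delimiter parameters makes getOverride() return a complete CSVParams, so encoding detection and delimiter sniffing are skipped. The delimiter value here is one of the names registered in STRING_TO_CHAR_DELIMITER_MAP; single-character values are also accepted by getOverride().

Metadata metadata = new Metadata();
// Treat the stream as pipe-delimited CSV in UTF-8, bypassing detection.
// Recognized delimiter names: comma, tab, pipe, semicolon, colon.
metadata.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE,
        "text/csv; charset=UTF-8; delimiter=pipe");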



