All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.parser.sas.SAS7BDATParser Maven / Gradle / Ivy

There is a newer version: 3.0.0-BETA2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.sas;

import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.List;
import java.util.Set;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Database;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.executable.MachineMetadata;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import com.epam.parso.Column;
import com.epam.parso.DataWriterUtil;
import com.epam.parso.SasFileProperties;
import com.epam.parso.SasFileReader;
import com.epam.parso.impl.SasFileReaderImpl;

/**
 * Processes the SAS7BDAT data columnar database file used by SAS and
 *  other similar languages.
 * <p>The file is read through the Parso {@link SasFileReader}. File level
 *  properties and the column names are recorded on the supplied
 *  {@link Metadata}, and the rows are emitted as a single XHTML table.
 */
public class SAS7BDATParser extends AbstractParser {
    private static final long serialVersionUID = -2775485539937983150L;

    private static final MediaType TYPE_SAS7BDAT =
            MediaType.application("x-sas-data");
    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.singleton(TYPE_SAS7BDAT);

    /**
     * Returns the media types this parser handles.
     *
     * @param context the parse context (not consulted)
     * @return a singleton set holding the <code>application/x-sas-data</code> type
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Reads a SAS7BDAT file from the stream, records its properties as
     *  metadata, and writes its contents to the handler as an XHTML table
     *  made up of one heading row followed by one row per record.
     * <p>This method does not itself close the supplied stream.
     *
     * @param stream   the SAS7BDAT data to read
     * @param handler  receives the XHTML SAX events
     * @param metadata populated with the content type, the file properties
     *                 and the column names
     * @param context  the parse context (not consulted)
     * @throws IOException   if reading the file fails
     * @throws SAXException  if the handler rejects an event
     * @throws TikaException declared by the parser contract
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        metadata.set(Metadata.CONTENT_TYPE, TYPE_SAS7BDAT.toString());

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        SasFileReader sas = new SasFileReaderImpl(stream);
        SasFileProperties props = sas.getSasFileProperties();

        // Record the interesting parts of the file's metadata
        metadata.set(TikaCoreProperties.TITLE, props.getName());
        metadata.set(TikaCoreProperties.CREATED, props.getDateCreated());
        metadata.set(TikaCoreProperties.MODIFIED, props.getDateModified());

        // These properties are stored as ints, so the counts are narrowed
        metadata.set(PagedText.N_PAGES,     (int)props.getPageCount());
        metadata.set(Database.COLUMN_COUNT, (int)props.getColumnsCount());
        metadata.set(Database.ROW_COUNT,    (int)props.getRowCount());

        // TODO Can we find more general properties for these / move
        //  these to more general places?
        metadata.set(HttpHeaders.CONTENT_ENCODING, props.getEncoding());
        metadata.set(OfficeOpenXMLExtended.APPLICATION, props.getServerType());
        metadata.set(OfficeOpenXMLExtended.APP_VERSION, props.getSasRelease());
        metadata.set(MachineMetadata.ARCHITECTURE_BITS,
                     props.isU64() ? "64" : "32");
        // An endianness value of 1 is reported as little endian, anything else as big
        metadata.set(MachineMetadata.ENDIAN, props.getEndianness() == 1 ?
                     MachineMetadata.Endian.LITTLE.getName() :
                     MachineMetadata.Endian.BIG.getName());

        // The following SAS Metadata fields are currently ignored:
        // compressionMethod
        // sessionEncoding
        // fileType
        // osName -
        // osType -
        // mixPageRowCount
        // headerLength
        // pageLength
        // rowLength

        // Fetch the column definitions once; they are reused for the
        //  metadata, the heading row and every data row
        List<Column> columns = sas.getColumns();

        // Process the column metadata
        // TODO Find keys to record the format and the type
        for (Column column : columns) {
            metadata.add(Database.COLUMN_NAME, displayName(column));
        }

        // Output file contents as a table
        xhtml.element("h1", props.getName());
        xhtml.startElement("table");
        xhtml.newline();

        // Do the column headings, keeping the raw column name as the title
        xhtml.startElement("tr");
        for (Column column : columns) {
            xhtml.startElement("th", "title", column.getName());
            xhtml.characters(displayName(column));
            xhtml.endElement("th");
        }
        xhtml.endElement("tr");
        xhtml.newline();

        // Process each row in turn, until the reader signals the end with null
        Object[] row;
        while ((row = sas.readNext()) != null) {
            xhtml.startElement("tr");
            for (String val : DataWriterUtil.getRowValues(columns, row)) {
                // Use explicit start/end, rather than element, to
                //  ensure that empty cells still get output
                xhtml.startElement("td");
                xhtml.characters(val);
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
            xhtml.newline();
        }

        // Finish
        xhtml.endElement("table");
        xhtml.endDocument();
    }

    /**
     * Picks the text used to present a column: its label when that is
     *  set and non-empty, otherwise its name.
     *
     * @param column the column to describe
     * @return the column's label, or its name if there is no usable label
     */
    private static String displayName(Column column) {
        String label = column.getLabel();
        if (label == null || label.isEmpty()) {
            return column.getName();
        }
        return label;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy