// All downloads are free. Search and download functionality uses the official Maven repository.
//
// org.apache.tika.parser.geoinfo.GeographicInformationParser Maven / Gradle / Ivy
//
// There is a newer version: 2024.11.18751.20241128T090041Z-241100
// Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.parser.geoinfo;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.sis.internal.util.CheckedArrayList;
import org.apache.sis.internal.util.CheckedHashSet;
import org.apache.sis.metadata.iso.DefaultMetadata;
import org.apache.sis.metadata.iso.DefaultMetadataScope;
import org.apache.sis.metadata.iso.constraint.DefaultLegalConstraints;
import org.apache.sis.metadata.iso.extent.DefaultGeographicBoundingBox;
import org.apache.sis.metadata.iso.extent.DefaultGeographicDescription;
import org.apache.sis.metadata.iso.identification.DefaultDataIdentification;
import org.apache.sis.storage.DataStore;
import org.apache.sis.storage.DataStoreException;
import org.apache.sis.storage.DataStores;
import org.apache.sis.storage.UnsupportedStorageException;
import org.apache.sis.util.collection.CodeListSet;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.opengis.metadata.Identifier;
import org.opengis.metadata.citation.Citation;
import org.opengis.metadata.citation.CitationDate;
import org.opengis.metadata.citation.OnlineResource;
import org.opengis.metadata.citation.ResponsibleParty;
import org.opengis.metadata.constraint.Restriction;
import org.opengis.metadata.distribution.DigitalTransferOptions;
import org.opengis.metadata.distribution.Distribution;
import org.opengis.metadata.distribution.Distributor;
import org.opengis.metadata.distribution.Format;
import org.opengis.metadata.extent.Extent;
import org.opengis.metadata.extent.GeographicExtent;
import org.opengis.metadata.identification.Identification;
import org.opengis.metadata.identification.Keywords;
import org.opengis.metadata.identification.Progress;
import org.opengis.metadata.identification.TopicCategory;
import org.opengis.util.InternationalString;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;


/**
 * Tika parser for geographic metadata documents of type {@code text/iso19139+xml}.
 *
 * <p>The input is spooled to a file, opened with Apache SIS {@link DataStores}, and the
 * resulting ISO metadata tree is flattened into the Tika {@link Metadata} object. A short
 * XHTML summary (title, cited responsible parties, abstract and geographic bounding boxes)
 * is written to the supplied content handler.</p>
 *
 * <p>The metadata keys emitted by this class (including trailing spaces and historical
 * spellings such as {@code "AccessContraints "}) are reproduced exactly as before, because
 * callers look values up by these keys.</p>
 */
public class GeographicInformationParser extends AbstractParser{

    private static final Logger LOG = LoggerFactory.getLogger(GeographicInformationParser.class);


    /** Content type reported for every document handled by this parser. */
    public static final String geoInfoType="text/iso19139+xml";
    private final Set<MediaType> SUPPORTED_TYPES =
            Collections.singleton(MediaType.text("iso19139+xml"));


    /**
     * Returns the media types handled by this parser.
     *
     * @param parseContext the parse context (not consulted)
     * @return a singleton set containing {@code text/iso19139+xml}
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        return SUPPORTED_TYPES;
    }

    /**
     * Parses the given stream as a geographic metadata document.
     *
     * @param inputStream    the document to parse
     * @param contentHandler receives the XHTML summary
     * @param metadata       receives the content type and the extracted metadata values
     * @param parseContext   the parse context (not consulted)
     * @throws IOException   if the stream cannot be spooled to a file
     * @throws SAXException  if the content handler rejects an event
     * @throws TikaException if Apache SIS cannot open, read or close the data store
     */
    @Override
    public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata,
                      ParseContext parseContext) throws IOException, SAXException, TikaException {
        metadata.set(Metadata.CONTENT_TYPE, geoInfoType);
        XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(contentHandler, metadata);

        // Temporary resources are only created (and later disposed) here when the caller
        // did not already pass in a TikaInputStream.
        TemporaryResources tmp = TikaInputStream.isTikaInputStream(inputStream) ? null
                : new TemporaryResources();
        try {
            TikaInputStream tikaInputStream = TikaInputStream.get(inputStream, tmp);
            File file = tikaInputStream.getFile();
            // try-with-resources: the data store is closed before the temporary file is
            // disposed in the finally block below, so no handle is left open on it.
            try (DataStore dataStore = DataStores.open(file)) {
                DefaultMetadata defaultMetadata = new DefaultMetadata(dataStore.getMetadata());
                extract(xhtmlContentHandler, metadata, defaultMetadata);
            }
        } catch (UnsupportedStorageException e) {
            throw new TikaException("UnsupportedStorageException", e);
        } catch (DataStoreException e) {
            throw new TikaException("DataStoreException", e);
        } finally {
            if (tmp != null) {
                tmp.dispose();
            }
        }
    }

    /**
     * Runs every metadata extractor and then writes the XHTML summary.
     */
    private void extract(XHTMLContentHandler xhtmlContentHandler, Metadata metadata,
                         DefaultMetadata defaultMetadata) throws SAXException {
        getMetaDataCharacterSet(metadata, defaultMetadata);
        getMetaDataContact(metadata, defaultMetadata);
        getMetaDataIdentificationInfo(metadata, defaultMetadata);
        getMetaDataDistributionInfo(metadata, defaultMetadata);
        getMetaDataDateInfo(metadata, defaultMetadata);
        getMetaDataResourceScope(metadata, defaultMetadata);
        getMetaDataParentMetaDataTitle(metadata, defaultMetadata);
        getMetaDataIdetifierCode(metadata, defaultMetadata);
        getMetaDataStandard(metadata, defaultMetadata);
        extractContent(xhtmlContentHandler, defaultMetadata);
    }

    /**
     * Writes the XHTML summary: for each identification, its title, cited responsible
     * parties, abstract and geographic bounding boxes. Parts that are absent from the
     * metadata are skipped instead of failing with a {@link NullPointerException}.
     */
    private void extractContent(XHTMLContentHandler xhtmlContentHandler,
                                DefaultMetadata defaultMetadata) throws SAXException {
        xhtmlContentHandler.startDocument();
        xhtmlContentHandler.newline();

        xhtmlContentHandler.newline();
        for (Identification identification : defaultMetadata.getIdentificationInfo()) {
            Citation citation = identification.getCitation();
            if (citation != null) {
                if (citation.getTitle() != null) {
                    xhtmlContentHandler.startElement("h1");
                    xhtmlContentHandler.characters(citation.getTitle().toString());
                    xhtmlContentHandler.endElement("h1");
                    xhtmlContentHandler.newline();
                }

                for (ResponsibleParty party : citation.getCitedResponsibleParties()) {
                    xhtmlContentHandler.startElement("h3");
                    xhtmlContentHandler.newline();
                    if (party.getRole() != null) {
                        xhtmlContentHandler.characters(
                                "CitedResponsiblePartyRole " + party.getRole().toString());
                    }
                    if (party.getIndividualName() != null) {
                        xhtmlContentHandler.characters(
                                "CitedResponsiblePartyName " + party.getIndividualName().toString());
                    }
                    xhtmlContentHandler.endElement("h3");
                    xhtmlContentHandler.newline();
                }
            }

            if (identification.getAbstract() != null) {
                xhtmlContentHandler.startElement("p");
                xhtmlContentHandler.newline();
                xhtmlContentHandler.characters(
                        "IdentificationInfoAbstract " + identification.getAbstract().toString());
                xhtmlContentHandler.endElement("p");
                xhtmlContentHandler.newline();
            }

            // Extents are read through DefaultDataIdentification; other identification
            // implementations are skipped rather than triggering a ClassCastException.
            if (identification instanceof DefaultDataIdentification) {
                for (Extent extent : ((DefaultDataIdentification) identification).getExtents()) {
                    for (GeographicExtent geographicExtent : extent.getGeographicElements()) {
                        if (geographicExtent instanceof DefaultGeographicBoundingBox) {
                            writeBoundingBox(xhtmlContentHandler,
                                    (DefaultGeographicBoundingBox) geographicExtent);
                        }
                    }
                }
            }
        }
        xhtmlContentHandler.newline();
        xhtmlContentHandler.endDocument();
    }

    /**
     * Writes the four bounds of a bounding box as label/value table rows.
     *
     * <p>NOTE: the west and east labels say "Latitude" although the values written are
     * longitudes. The label text is kept unchanged so that existing consumers of the
     * extracted content keep matching it.</p>
     */
    private void writeBoundingBox(XHTMLContentHandler xhtmlContentHandler,
                                  DefaultGeographicBoundingBox box) throws SAXException {
        writeRow(xhtmlContentHandler, "GeographicElementWestBoundLatitude",
                box.getWestBoundLongitude());
        writeRow(xhtmlContentHandler, "GeographicElementEastBoundLatitude",
                box.getEastBoundLongitude());
        writeRow(xhtmlContentHandler, "GeographicElementNorthBoundLatitude",
                box.getNorthBoundLatitude());
        writeRow(xhtmlContentHandler, "GeographicElementSouthBoundLatitude",
                box.getSouthBoundLatitude());
    }

    /** Writes one {@code tr} element holding a label cell and a numeric value cell. */
    private void writeRow(XHTMLContentHandler xhtmlContentHandler, String label, double value)
            throws SAXException {
        xhtmlContentHandler.startElement("tr");
        xhtmlContentHandler.startElement("td");
        xhtmlContentHandler.characters(label);
        xhtmlContentHandler.endElement("td");
        xhtmlContentHandler.startElement("td");
        xhtmlContentHandler.characters(String.valueOf(value));
        xhtmlContentHandler.endElement("td");
        xhtmlContentHandler.endElement("tr");
    }

    /** Adds {@code value.toString()} under {@code key} unless {@code value} is {@code null}. */
    private static void addIfPresent(Metadata metadata, String key, Object value) {
        if (value != null) {
            metadata.add(key, value.toString());
        }
    }

    /** Adds the name of every character set declared for the metadata record. */
    private void getMetaDataCharacterSet(Metadata metadata, DefaultMetadata defaultMetaData){
        for (Charset charset : defaultMetaData.getLocalesAndCharsets().values()) {
            // Guard against a locale that has no associated character set.
            if (charset != null) {
                metadata.add("CharacterSet", charset.name());
            }
        }
    }


    /** Adds the role and organisation name of every metadata contact. */
    private void getMetaDataContact(Metadata metadata, DefaultMetadata defaultMetaData){
        for (ResponsibleParty contact : defaultMetaData.getContacts()) {
            if (contact.getRole() != null) {
                metadata.add("ContactRole", contact.getRole().name());
            }
            addIfPresent(metadata, "ContactPartyName-", contact.getOrganisationName());
        }
    }

    /**
     * Adds the identification section: citation, abstract, status, formats, languages,
     * topic categories, keywords, legal constraints and geographic descriptions.
     */
    private void getMetaDataIdentificationInfo(Metadata metadata, DefaultMetadata defaultMetaData){
        // Shared across all identifications, and incremented BEFORE first use, so the
        // first keyword group is published as "Keywords 2". Preserved as-is because the
        // resulting keys are visible to callers.
        int keywordGroup = 1;
        for (Identification identification : defaultMetaData.getIdentificationInfo()) {
            addCitation(metadata, identification.getCitation());

            addIfPresent(metadata, "IdentificationInfoAbstract ", identification.getAbstract());
            for (Progress progress : identification.getStatus()) {
                metadata.add("IdentificationInfoStatus ", progress.name());
            }
            for (Format format : identification.getResourceFormats()) {
                addIfPresent(metadata, "ResourceFormatSpecificationAlternativeTitle ",
                        format.getName());
            }

            // Languages, topic categories and extents are read through
            // DefaultDataIdentification; other implementations simply do not contribute them.
            DefaultDataIdentification dataIdentification =
                    identification instanceof DefaultDataIdentification
                            ? (DefaultDataIdentification) identification : null;
            if (dataIdentification != null) {
                for (Locale locale : dataIdentification.getLocalesAndCharsets().keySet()) {
                    metadata.add("IdentificationInfoLanguage-->",
                            locale.getDisplayLanguage(Locale.ENGLISH));
                }
                for (TopicCategory category : dataIdentification.getTopicCategories()) {
                    metadata.add("IdentificationInfoTopicCategory-->", category.name());
                }
            }

            for (Keywords keywords : identification.getDescriptiveKeywords()) {
                keywordGroup++;
                addKeywords(metadata, keywords, keywordGroup);
            }

            for (Object constraints : identification.getResourceConstraints()) {
                // Only legal constraints carry access/use/other restrictions.
                if (constraints instanceof DefaultLegalConstraints) {
                    addLegalConstraints(metadata, (DefaultLegalConstraints) constraints);
                }
            }

            if (dataIdentification != null) {
                for (Extent extent : dataIdentification.getExtents()) {
                    for (GeographicExtent geographicExtent : extent.getGeographicElements()) {
                        if (geographicExtent instanceof DefaultGeographicDescription) {
                            addGeographicDescription(metadata,
                                    (DefaultGeographicDescription) geographicExtent);
                        }
                    }
                }
            }
        }
    }

    /** Adds title, dates and cited responsible parties of a citation; no-op when absent. */
    private void addCitation(Metadata metadata, Citation citation) {
        if (citation == null) {
            return;
        }
        addIfPresent(metadata, "IdentificationInfoCitationTitle ", citation.getTitle());

        for (CitationDate date : citation.getDates()) {
            if (date.getDateType() != null) {
                metadata.add("CitationDate ", date.getDateType().name() + "-->" + date.getDate());
            }
        }
        for (ResponsibleParty party : citation.getCitedResponsibleParties()) {
            if (party.getRole() != null) {
                metadata.add("CitedResponsiblePartyRole ", party.getRole().toString());
            }
            addIfPresent(metadata, "CitedResponsiblePartyName ", party.getIndividualName());
            addIfPresent(metadata, "CitedResponsiblePartyOrganizationName ",
                    party.getOrganisationName());
            addIfPresent(metadata, "CitedResponsiblePartyPositionName ", party.getPositionName());

            if (party.getContactInfo() != null && party.getContactInfo().getAddress() != null) {
                for (String email
                        : party.getContactInfo().getAddress().getElectronicMailAddresses()) {
                    metadata.add("CitedResponsiblePartyEMail ", email);
                }
            }
        }
    }

    /** Adds one keyword group, its type and its thesaurus under keys suffixed with {@code index}. */
    private void addKeywords(Metadata metadata, Keywords keywords, int index) {
        for (InternationalString keyword : keywords.getKeywords()) {
            metadata.add("Keywords " + index, keyword.toString());
        }
        if (keywords.getType() != null) {
            metadata.add("KeywordsType " + index, keywords.getType().name());
        }

        Citation thesaurus = keywords.getThesaurusName();
        if (thesaurus == null) {
            return;
        }
        addIfPresent(metadata, "ThesaurusNameTitle " + index, thesaurus.getTitle());
        // The collection itself is stringified, matching the historical value format.
        addIfPresent(metadata, "ThesaurusNameAlternativeTitle " + index,
                thesaurus.getAlternateTitles());
        for (CitationDate date : thesaurus.getDates()) {
            if (date.getDateType() != null) {
                metadata.add("ThesaurusNameDate ",
                        date.getDateType().name() + "-->" + date.getDate());
            }
        }
    }

    /** Adds access, other and use constraints of a legal-constraints entry. */
    private void addLegalConstraints(Metadata metadata, DefaultLegalConstraints constraints) {
        for (Restriction restriction : constraints.getAccessConstraints()) {
            metadata.add("AccessContraints ", restriction.name());
        }
        for (InternationalString other : constraints.getOtherConstraints()) {
            metadata.add("OtherConstraints ", other.toString());
        }
        for (Restriction restriction : constraints.getUseConstraints()) {
            metadata.add("UserConstraints ", restriction.name());
        }
    }

    /** Adds the identifier code and authority details of a geographic description. */
    private void addGeographicDescription(Metadata metadata,
                                          DefaultGeographicDescription description) {
        Identifier identifier = description.getGeographicIdentifier();
        if (identifier == null) {
            return;
        }
        addIfPresent(metadata, "GeographicIdentifierCode ", identifier.getCode());

        Citation authority = identifier.getAuthority();
        if (authority == null) {
            return;
        }
        addIfPresent(metadata, "GeographicIdentifierAuthorityTitle ", authority.getTitle());
        for (InternationalString title : authority.getAlternateTitles()) {
            metadata.add("GeographicIdentifierAuthorityAlternativeTitle ", title.toString());
        }
        for (CitationDate date : authority.getDates()) {
            if (date.getDateType() != null && date.getDate() != null) {
                metadata.add("GeographicIdentifierAuthorityDate ",
                        date.getDateType().name() + " " + date.getDate().toString());
            }
        }
    }

    /** Adds distribution formats, distributors and online transfer options, if any. */
    private void getMetaDataDistributionInfo(Metadata metadata, DefaultMetadata defaultMetaData){
        Distribution distribution = defaultMetaData.getDistributionInfo();
        if (distribution == null) {
            // Records without a distribution section contribute nothing here.
            return;
        }
        for (Format format : distribution.getDistributionFormats()) {
            addIfPresent(metadata, "DistributionFormatSpecificationAlternativeTitle ",
                    format.getName());
        }
        for (Distributor distributor : distribution.getDistributors()) {
            if (distributor == null || distributor.getDistributorContact() == null) {
                continue;
            }
            ResponsibleParty contact = distributor.getDistributorContact();
            if (contact.getRole() != null) {
                metadata.add("Distributor Contact ", contact.getRole().name());
            }
            addIfPresent(metadata, "Distributor Organization Name ",
                    contact.getOrganisationName());
        }
        for (DigitalTransferOptions options : distribution.getTransferOptions()) {
            for (OnlineResource online : options.getOnLines()) {
                addIfPresent(metadata, "TransferOptionsOnlineLinkage ", online.getLinkage());
                if (online.getProtocol() != null) {
                    metadata.add("TransferOptionsOnlineProtocol ", online.getProtocol());
                }
                if (online.getApplicationProfile() != null) {
                    metadata.add("TransferOptionsOnlineProfile ", online.getApplicationProfile());
                }
                if (online.getName() != null) {
                    metadata.add("TransferOptionsOnlineName ", online.getName());
                }
                addIfPresent(metadata, "TransferOptionsOnlineDescription ",
                        online.getDescription());
                if (online.getFunction() != null) {
                    metadata.add("TransferOptionsOnlineFunction ", online.getFunction().name());
                }
            }
        }
    }

    /** Adds every metadata date that has a date type, as {@code "<type> <date>"}. */
    private void getMetaDataDateInfo(Metadata metadata, DefaultMetadata defaultMetaData){
        for (CitationDate date : defaultMetaData.getDateInfo()) {
            if (date.getDateType() != null) {
                metadata.add("DateInfo ", date.getDateType().name() + " " + date.getDate());
            }
        }
    }

    /** Adds the resource scope code of every metadata scope. */
    private void getMetaDataResourceScope(Metadata metadata, DefaultMetadata defaultMetaData){
        for (DefaultMetadataScope scope : defaultMetaData.getMetadataScopes()) {
            if (scope.getResourceScope() != null) {
                metadata.add("MetaDataResourceScope ", scope.getResourceScope().name());
            }
        }
    }

    /** Adds the title of the parent metadata citation, when present. */
    private void getMetaDataParentMetaDataTitle(Metadata metadata, DefaultMetadata defaultMetaData){
        Citation parentMetaData = defaultMetaData.getParentMetadata();
        if (parentMetaData != null) {
            addIfPresent(metadata, "ParentMetaDataTitle", parentMetaData.getTitle());
        }
    }

    /** Adds the code of the metadata identifier, when both identifier and code are present. */
    private void getMetaDataIdetifierCode(Metadata metadata, DefaultMetadata defaultMetaData){
        Identifier identifier = defaultMetaData.getMetadataIdentifier();
        if (identifier != null && identifier.getCode() != null) {
            metadata.add("MetaDataIdentifierCode", identifier.getCode());
        }
    }

    /** Adds the title and edition of every metadata standard citation. */
    private void getMetaDataStandard(Metadata metadata, DefaultMetadata defaultMetaData){
        for (Citation standard : defaultMetaData.getMetadataStandards()) {
            addIfPresent(metadata, "MetaDataStandardTitle ", standard.getTitle());
            addIfPresent(metadata, "MetaDataStandardEdition ", standard.getEdition());
        }
    }
}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy