All downloads are free. Search and download functionality uses the official Maven repository.

io.anserini.collection.Iso19115Collection Maven / Gradle / Ivy

/*
 * Anserini: A Lucene toolkit for reproducible information retrieval research
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.anserini.collection;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.MappingIterator;
import com.fasterxml.jackson.databind.ObjectMapper;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.NoSuchElementException;

/**
 * A document collection of ISO 19115 metadata records serialized as JSON.
 *
 * <p>Each input is either a sequence of top-level JSON objects (one record each) or a single
 * top-level JSON array whose elements are the records.
 */
public class Iso19115Collection extends DocumentCollection {
  /**
   * Creates a collection for the given path and sets the allowed file suffixes to
   * {@code .json} and {@code .jsonl}.
   *
   * @param path the path assigned to this collection
   */
  public Iso19115Collection(Path path) {
    this.path = path;
    this.allowedFileSuffix = new HashSet<>(Arrays.asList(".json", ".jsonl"));
  }

  /** Creates a collection without assigning a path or allowed file suffixes. */
  public Iso19115Collection() {
  }

  /**
   * Creates a segment that reads records from the file at {@code p}.
   *
   * @param p the file to read
   * @return a new {@link Segment} over the file
   * @throws IOException if the file cannot be opened or the JSON reader cannot be created
   */
  @Override
  public FileSegment createFileSegment(Path p) throws IOException {
    return new Segment(p);
  }

  /**
   * Creates a segment that reads records from an already opened reader.
   *
   * @param bufferedReader the reader supplying the JSON text
   * @return a new {@link Segment} over the reader
   * @throws IOException if the JSON reader cannot be created
   */
  @Override
  public FileSegment createFileSegment(BufferedReader bufferedReader) throws IOException {
    return new Segment(bufferedReader);
  }

  /**
   * A segment that yields one {@link Document} per JSON record.
   */
  public static class Segment extends FileSegment {
    /** Shared mapper used only to create tree-model readers. */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /** The next top-level value to serve, or the top-level array itself when the input is an array. */
    private JsonNode node = null;
    /** Cursor over the elements of {@link #node} when it is an array; {@code null} otherwise. */
    private Iterator<JsonNode> iter = null;
    /** Streaming reader over the top-level JSON values of the input. */
    private MappingIterator<JsonNode> iterator;

    /**
     * Opens the file at {@code path} and positions the segment on its first JSON value.
     *
     * @param path the JSON file to read
     * @throws IOException if the file cannot be opened or the JSON reader cannot be created
     */
    public Segment(Path path) throws IOException {
      super(path);
      // Decode explicitly as UTF-8 instead of relying on the platform default charset.
      bufferedReader = new BufferedReader(
          new InputStreamReader(Files.newInputStream(path), StandardCharsets.UTF_8));
      initialize(bufferedReader);
    }

    /**
     * Positions the segment on the first JSON value supplied by {@code bufferedReader}.
     *
     * @param bufferedReader the reader supplying the JSON text
     * @throws IOException if the JSON reader cannot be created
     */
    public Segment(BufferedReader bufferedReader) throws IOException {
      super(bufferedReader);
      initialize(bufferedReader);
    }

    /** Creates the value iterator over {@code reader} and pre-reads the first top-level value. */
    private void initialize(BufferedReader reader) throws IOException {
      iterator = MAPPER.readerFor(JsonNode.class).readValues(reader);
      if (iterator.hasNext()) {
        node = iterator.next();
        if (node.isArray()) {
          iter = node.elements();
        }
      }
    }

    /**
     * Buffers the next record.
     *
     * <p>For a sequence of top-level objects, the current object becomes the buffered record and
     * the following value is pre-read; {@code atEOF} is set when no value follows. For a top-level
     * array, the next array element becomes the buffered record, and exhaustion of the array is
     * reported by throwing.
     *
     * @throws NoSuchElementException if the input was empty, the array has no further elements,
     *     or the current value is neither an object nor an array
     */
    @Override
    public void readNext() throws NoSuchElementException {
      if (node == null) {
        throw new NoSuchElementException("JsonNode is empty");
      } else if (node.isObject()) {
        bufferedRecord = new Iso19115Collection.Document(node);
        if (iterator.hasNext()) {
          node = iterator.next();
        } else {
          atEOF = true;
        }
      } else if (node.isArray()) {
        if (iter != null && iter.hasNext()) {
          // Build the record from the array element, not from the enclosing array.
          JsonNode json = iter.next();
          bufferedRecord = new Iso19115Collection.Document(json);
        } else {
          throw new NoSuchElementException("Reached end of JsonNode iterator");
        }
      } else {
        throw new NoSuchElementException("Invalid JsonNode type");
      }
    }
  }

  /**
   * A single ISO 19115 metadata record with the fields extracted from its JSON tree.
   *
   * <p>Plain-text fields resolve to an empty string when the corresponding element is absent.
   * The file identifier, thesaurus citation, responsible parties, keywords and bounding box are
   * read strictly: a missing element there raises a {@link NullPointerException}.
   */
  public static class Document implements SourceDocument {
    protected String id;
    protected String title;
    static final String[] titlePath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:citation", "gmd:CI_Citation", "gmd:title",
                                      "gco:CharacterString"};
    protected String abstractContent;
    static final String[] abstractContentPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:abstract", "gco:CharacterString"};
    protected String raw;
    protected String organisation;
    // Organisation name of the metadata contact (sibling of the individualName used for catalogue).
    static final String[] organisationPath = {"gmd:MD_Metadata", "gmd:contact", "gmd:CI_ResponsibleParty", "gmd:organisationName", "gco:CharacterString"};
    protected String[] responsibleParty;
    static final String[] responsiblePartyPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:citation", "gmd:CI_Citation", "gmd:citedResponsibleParty"};
    protected String catalogue;
    static final String[] cataloguePath = {"gmd:MD_Metadata", "gmd:contact", "gmd:CI_ResponsibleParty", "gmd:individualName", "gco:CharacterString"};
    protected String publish_time;
    static final String[] publish_timePath = {"gmd:MD_Metadata", "gmd:dateStamp", "gco:Date"};
    protected String url;
    static final String[] urlPath = {"gmd:MD_Metadata", "gmd:dataSetURI", "gco:CharacterString"};
    protected double[] latitude;
    protected double[] longitude;
    protected String coordinates;
    static final String[] coordinatePath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:extent", "gmd:EX_Extent", "gmd:geographicElement",
                                            "gmd:EX_GeographicBoundingBox"};
    protected String purpose;
    static final String[] purposePath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:purpose", "gco:CharacterString"};
    protected String supplInfo;
    static final String[] supplInfoPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:supplementalInformation", "gco:CharacterString"};
    protected String topicCategory;
    static final String[] topicCategoryPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:topicCategory", "gmd:MD_TopicCategoryCode"};
    protected String[] keywords;
    static final String[] keywordPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:descriptiveKeywords"};
    protected String recommendedCitation;
    static final String[] recommendedCitationPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:citation", "gmd:CI_Citation",
                                                    "gmd:otherCitationDetails", "gco:CharacterString"};
    protected String thesaurusName;
    static final String[] theasurusNameMainPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:descriptiveKeywords"};
    static final String[] theasurusNameSubPath = {"gmd:MD_Keywords", "gmd:thesaurusName", "gmd:CI_Citation"};

    /**
     * Builds a document from one ISO 19115 record.
     *
     * @param json the JSON tree of a single record, rooted above {@code gmd:MD_Metadata}
     * @throws NullPointerException if a strictly read element (see class comment) is missing
     * @throws StringIndexOutOfBoundsException if the file identifier is shorter than 8 characters
     */
    public Document(JsonNode json) {
      this.raw = json.toString();

      String identifier = json.get("gmd:MD_Metadata").get("gmd:fileIdentifier").get("gco:CharacterString").asText();
      // The id is the identifier without its last 8 characters.
      // NOTE(review): presumably a fixed-length suffix on every identifier; confirm against the data.
      this.id = identifier.substring(0, identifier.length() - 8);

      this.title = extractText(titlePath, json);
      this.abstractContent = extractText(abstractContentPath, json);
      this.organisation = extractText(organisationPath, json);
      this.catalogue = extractText(cataloguePath, json);
      this.publish_time = extractText(publish_timePath, json);
      this.url = extractText(urlPath, json);
      this.purpose = extractText(purposePath, json);
      this.supplInfo = extractText(supplInfoPath, json);
      this.topicCategory = extractText(topicCategoryPath, json);
      this.recommendedCitation = extractText(recommendedCitationPath, json);

      // The thesaurus citation is taken from the first descriptiveKeywords entry.
      JsonNode firstKeywordGroup = extractNode(theasurusNameMainPath, json).get(0);
      JsonNode thesaurusCitation = extractNode(theasurusNameSubPath, firstKeywordGroup);
      this.thesaurusName = thesaurusCitation.get("gmd:title").get("gco:CharacterString").asText()
                           + " : " +
                           thesaurusCitation.get("gmd:otherCitationDetails").get("gco:CharacterString").asText();

      // One individual name per cited responsible party.
      JsonNode parties = extractNode(responsiblePartyPath, json);
      responsibleParty = new String[parties.size()];
      for (int i = 0; i < responsibleParty.length; i++) {
        responsibleParty[i] = parties.get(i).get("gmd:CI_ResponsibleParty").get("gmd:individualName").get("gco:CharacterString").asText();
      }

      // Keywords of the first descriptiveKeywords entry.
      JsonNode keywordNodes = extractNode(keywordPath, json).get(0).get("gmd:MD_Keywords").get("gmd:keyword");
      keywords = new String[keywordNodes.size()];
      for (int i = 0; i < keywords.length; i++) {
        keywords[i] = keywordNodes.get(i).get("gco:CharacterString").asText();
      }

      // The bounding box is stored as four corners:
      // (north, west), (north, east), (south, east), (south, west).
      JsonNode boundingBox = extractNode(coordinatePath, json);

      latitude = new double[4];
      latitude[0] = boundingBox.get("gmd:northBoundLatitude").get("gco:Decimal").asDouble();
      latitude[2] = boundingBox.get("gmd:southBoundLatitude").get("gco:Decimal").asDouble();
      // Widen a zero-height box so a single point is still drawn as a small rectangle.
      if (latitude[0] == latitude[2]) {
        latitude[0] -= 0.01;
        latitude[2] += 0.01;
      }
      latitude[1] = latitude[0];
      latitude[3] = latitude[2];

      longitude = new double[4];
      longitude[0] = boundingBox.get("gmd:westBoundLongitude").get("gco:Decimal").asDouble();
      longitude[1] = boundingBox.get("gmd:eastBoundLongitude").get("gco:Decimal").asDouble();
      // Widen a zero-width box so a single point is still drawn as a small rectangle.
      if (longitude[0] == longitude[1]) {
        longitude[0] -= 0.01;
        longitude[1] += 0.01;
      }
      longitude[2] = longitude[1];
      longitude[3] = longitude[0];

      this.coordinates = getCoordinateString();
    }

    /**
     * Walks {@code nodeNames} from {@code json} and returns the node reached.
     *
     * <p>Traversal is strict: a missing intermediate element raises a
     * {@link NullPointerException}, and a missing final element yields {@code null}.
     */
    private static JsonNode extractNode(String[] nodeNames, JsonNode json) {
      JsonNode current = json;
      for (String name : nodeNames) {
        current = current.get(name);
      }
      return current;
    }

    /**
     * Walks {@code nodeNames} from {@code json} and returns the text of the node reached.
     *
     * <p>Traversal is lenient: {@code path} substitutes a missing node for any absent element,
     * whose text is the empty string, so absent optional elements do not abort construction.
     */
    private static String extractText(String[] nodeNames, JsonNode json) {
      JsonNode current = json;
      for (String name : nodeNames) {
        current = current.path(name);
      }
      return current.asText();
    }

    /** Returns the citation title. */
    public String getTitle() {
      return title;
    }

    /** Returns the abstract text. */
    public String getAbstract() {
      return abstractContent;
    }

    /** Returns the organisation name of the metadata contact. */
    public String getOrganisation() {
      return organisation;
    }

    /** Returns the individual names of the cited responsible parties. */
    public String[] getResponsibleParty() {
      return responsibleParty;
    }

    /** Returns the individual name of the metadata contact. */
    public String getCatalogue() {
      return catalogue;
    }

    /** Returns the metadata date stamp as text. */
    public String getPublish_time() {
      return publish_time;
    }

    /** Returns the data set URI. */
    public String getUrl() {
      return url;
    }

    /** Returns the bounding-box corners formatted as a nested array literal. */
    public String getCoordinates() {
      return coordinates;
    }

    /** Returns the supplemental information text. */
    public String getSupplInfo() {
      return supplInfo;
    }

    /** Returns the topic category code. */
    public String getTopicCategory() {
      return topicCategory;
    }

    /** Returns the keywords of the first descriptive-keywords entry. */
    public String[] getKeywords() {
      return keywords;
    }

    /** Returns the other-citation-details text of the citation. */
    public String getRecommendedCitation() {
      return recommendedCitation;
    }

    /** Returns the thesaurus title and its citation details joined by {@code " : "}. */
    public String getThesaurusName() {
      return thesaurusName;
    }

    /** Returns the purpose text. */
    public String getPurpose() {
      return purpose;
    }

    /**
     * Formats the four corners as {@code [[lat,lon],[lat,lon],[lat,lon],[lat,lon]]}, a form
     * intended for literal evaluation in JavaScript.
     */
    private String getCoordinateString() {
      StringBuilder formatted = new StringBuilder("[");
      for (int i = 0; i < 4; i++) {
        if (i > 0) {
          formatted.append(",");
        }
        formatted.append("[")
                 .append(latitude[i])
                 .append(",")
                 .append(longitude[i])
                 .append("]");
      }
      formatted.append("]");
      return formatted.toString();
    }

    @Override
    public String id() {
      return id;
    }

    /** Returns the title and the abstract separated by a newline. */
    @Override
    public String contents() {
      return title + "\n" + abstractContent;
    }

    /** Returns the JSON text of the record this document was built from. */
    @Override
    public String raw() {
      return raw;
    }

    @Override
    public boolean indexable() {
      return true;
    }

  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy