All downloads are free. Search and download functionality uses the official Maven repository.

de.datexis.sector.reader.WikiSectionReader Maven / Gradle / Ivy

There is a newer version: 1.3.3
Show newest version
package de.datexis.sector.reader;

import de.datexis.common.ObjectSerializer;
import de.datexis.common.Resource;
import de.datexis.model.Annotation;
import de.datexis.model.Dataset;
import de.datexis.model.Document;
import de.datexis.reader.DatasetReader;
import de.datexis.sector.model.WikiDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * Reads WikiSection Datasets from JSON file.
 * <p>
 * Two entry points are offered: {@link #readDatasetFromJSON(Resource)} collects the
 * parsed documents into a {@link Dataset}, while
 * {@link #readWikiDocumentsFromJSON(Resource)} returns the parsed
 * {@link WikiDocument}s as a plain list.
 *
 * @author Sebastian Arnold
 */
public class WikiSectionReader implements DatasetReader {

  protected static final Logger log = LoggerFactory.getLogger(WikiSectionReader.class);

  /**
   * Reads a Dataset from the given JSON resource by delegating to
   * {@link #readDatasetFromJSON(Resource)}.
   *
   * @param path the JSON resource to read
   * @return the Dataset built from the resource
   * @throws IOException if reading the resource fails
   */
  @Override
  public Dataset read(Resource path) throws IOException {
    return readDatasetFromJSON(path);
  }

  /**
   * Reads all Documents from a JSON resource into a new Dataset. The Dataset is named
   * after the resource's file name with every occurrence of ".json" removed.
   * Every Annotation of every Document is marked as {@code GOLD} with confidence 1.0.
   * Documents that report themselves as empty are not added; a warning is logged instead.
   *
   * @param path the JSON resource to read
   * @return a Dataset containing all non-empty Documents
   * @throws IOException if reading the resource fails
   */
  public static Dataset readDatasetFromJSON(Resource path) throws IOException {
    log.info("Reading Wiki Articles from {}", path.toString());
    Dataset result = new Dataset(path.getFileName().replace(".json", ""));
    // typed iterator: a raw Iterator yields Object and cannot be assigned to Document
    Iterator<Document> it = ObjectSerializer.readJSONDocumentIterable(path);
    while(it.hasNext()) {
      Document doc = it.next();
      for(Annotation ann : doc.getAnnotations()) {
        ann.setSource(Annotation.Source.GOLD);
        ann.setConfidence(1.0);
      }
      if(!doc.isEmpty()) {
        result.addDocument(doc);
      } else {
        log.warn("read empty document {}", doc.getId());
      }
    }
    return result;
  }

  /**
   * Reads all WikiDocuments from a JSON resource into a list. Every Annotation of every
   * document is marked as {@code GOLD}. Unlike {@link #readDatasetFromJSON(Resource)},
   * this method does not set a confidence and does not filter empty documents.
   *
   * @param path the JSON resource to read
   * @return a mutable list of all WikiDocuments in reading order
   * @throws IOException if opening or reading the resource fails
   */
  public static List<WikiDocument> readWikiDocumentsFromJSON(Resource path) throws IOException {
    log.info("Reading Wiki Articles from {}", path.toString());
    List<WikiDocument> result = new ArrayList<>();
    // try-with-resources makes sure the stream is released even if parsing fails midway
    try(InputStream in = path.getInputStream()) {
      Iterator<WikiDocument> it = ObjectSerializer.getObjectMapper().readerFor(WikiDocument.class).readValues(in);
      while(it.hasNext()) {
        WikiDocument doc = it.next();
        for(Annotation ann : doc.getAnnotations()) {
          ann.setSource(Annotation.Source.GOLD);
        }
        result.add(doc);
      }
    }
    return result;
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy