package de.datexis.model;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.google.common.collect.Lists;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Holds a collection of Documents in memory(!)
 * @author sarnold
 */
public class Dataset {
  
  private static final Logger log = LoggerFactory.getLogger(Dataset.class);
  
  /** The name of this dataset */
  private String name;
  
  /** The language of this dataset */
  private String language = null;
  
  /** The unique ID of this dataset (e.g. database primary key) */
  private Long uid = null;
  
  /** A list of all Documents in this dataset */
  private List<Document> documents;
  
  /** A list of Queries and their Results on this dataset */
  List<Query> queries = Collections.synchronizedList(new ArrayList<>());
  
  /** Shared random number generator */
  protected static Random random = new Random();
  
  public Dataset() {
    this("");
  }
  
  public Dataset(String name) {
    this(name, Collections.synchronizedList(new ArrayList<>()));
  }
  
  public Dataset(String name, List<Document> docs) {
    this.documents = Collections.synchronizedList(docs);
    this.name = name;
  }
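
  /*
   * Usage sketch (illustrative): creating a Dataset and filling it with
   * Documents. How the Documents themselves are built is out of scope here;
   * readDocumentsSomehow() is a hypothetical placeholder for a reader or
   * factory elsewhere in the library.
   *
   *   Dataset data = new Dataset("my-corpus");
   *   for(Document doc : readDocumentsSomehow()) {
   *     data.addDocument(doc);
   *   }
   */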
  
  public String getName() {
    return name;
  }
  
  public void setName(String name) {
    this.name = name;
  }
  
  @JsonInclude(JsonInclude.Include.NON_NULL)
  public String getLanguage() {
    return language;
  }
  
  public void setUid(Long uid) {
    this.uid = uid;
  }
  
  @JsonInclude(JsonInclude.Include.NON_NULL)
  public Long getUid() {
    return this.uid;
  }

  public void setLanguage(String language) {
    this.language = language;
  }
  
  /**
   * @return a Dataset that references a split of Documents. Caution: this is not a deep copy.
   */
  public Dataset getSplit(int offset, int count) {
    if(offset < 0) offset = countDocuments() + offset;
    if(count < 0) count = countDocuments() + count;
    List<Document> docs = streamDocuments(offset, count).collect(Collectors.toList());
    Dataset result = new Dataset(getName(), docs);
    result.setQueries(Lists.newArrayList(getQueries()));
    return result;
  }
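
  /*
   * Example (illustrative): since negative arguments count back from the end,
   * getSplit() can produce a train/test split without knowing the exact size.
   * Assumes this Dataset holds more than 1000 Documents.
   *
   *   Dataset train = data.getSplit(0, -1000);     // all but the last 1000 Documents
   *   Dataset test  = data.getSplit(-1000, 1000);  // the last 1000 Documents
   *
   * Both splits reference the same underlying Document objects (no deep copy).
   */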
  
  /**
   * @return all Documents in this Dataset in no particular order
   */
  public Collection<Document> getDocuments() {
    return documents;
  }
  
  /**
   * @return a Stream of all Documents in this Dataset
   */
  public Stream<Document> streamDocuments() {
    return documents.stream();
  }
  
  /**
   * @return a subset of Documents in this Dataset
   */
  public List<Document> getDocuments(int startIndex, int count) {
    return streamDocuments(startIndex, count)
      .collect(Collectors.toList());
  }
  
  /**
   * @return a Stream of Documents
   */
  public Stream<Document> streamDocuments(int startIndex, int count) {
    return streamDocuments()
      .skip(startIndex)
      .limit(count);
  }
  
  /**
   * @return the Document with given index
   */
  public Optional<Document> getDocument(int index) {
    return streamDocuments()
      .skip(index)
      .findFirst();
  }
  
  /**
   * Find a Document with given ID in the Dataset.
   * If multiple Documents exist with the same ID, only one is returned.
   * @return the Document with given ID
   */
  public Optional<Document> getDocument(String id) {
    return streamDocuments()
      .filter(doc -> doc.getId().equals(id))
      .findFirst();
  }
  
  /**
   * @return a random Document of this Dataset
   */
  @JsonIgnore
  public Optional<Document> getRandomDocument() {
    int index = random.nextInt(countDocuments());
    return getDocument(index);
  }

  public void randomizeDocuments() {
    Collections.shuffle(documents);
  }
  
  public void randomizeDocuments(long seed) {
    Collections.shuffle(documents, new Random(seed));
  }
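
  /*
   * Example (illustrative): shuffling with a fixed seed is reproducible, so a
   * subsequent getSplit() yields the same partition on every run.
   *
   *   data.randomizeDocuments(42L);
   *   Dataset train = data.getSplit(0, -1000);
   *   Dataset test  = data.getSplit(-1000, 1000);
   */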
  
  /**
   * @return stream over all Sentences in the Dataset. Caution: Boundaries are still given on Document level.
   */
  @JsonIgnore
  public Stream<Sentence> streamSentences() {
    return streamDocuments().flatMap(Document::streamSentences);
  }
  
  /**
   * @return stream over all Tokens in the Dataset. Caution: Boundaries are still given on Document level.
   */
  @JsonIgnore
  public Stream<Token> streamTokens() {
    return streamDocuments().flatMap(Document::streamTokens);
  }
  
  @JsonIgnore
  @SuppressWarnings("unchecked")
  public <S extends Span> Stream<S> getStream(Class<S> spanClass) {
    if(spanClass == Sentence.class) return (Stream<S>) streamSentences();
    else if(spanClass == Token.class) return (Stream<S>) streamTokens();
    else return (Stream<S>) streamTokens(); // fall back to Tokens for unknown Span classes
  }
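
  /*
   * Example (illustrative): the generic accessor is equivalent to calling the
   * matching stream method directly.
   *
   *   long n = data.getStream(Sentence.class).count();  // same result as countSentences()
   */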
  
  /**
   * Add a document to the end of this Dataset
   */
  public void addDocument(Document doc) {
    if(language == null) setLanguage(doc.getLanguage());
    documents.add(doc);
  }
  
  /**
   * Add a document to the front of this Dataset
   */
  public void addDocumentFront(Document doc) {
    documents.add(0, doc);
  }
  
  /**
   * @return the number of Documents in this Dataset
   */
  public int countDocuments() {
    return documents.size();
  }
  
  /**
   * @return the number of Sentences in all Documents in this Dataset
   */
  public long countSentences() {
    return streamDocuments().mapToLong(d -> d.countSentences()).sum();
  }
  
  /**
   * @return the number of Tokens in all Documents in this Dataset
   */
  public long countTokens() {
    return streamDocuments().mapToLong(d -> d.countTokens()).sum();
  }
  
  /**
   * @return the number of Annotations in all Documents in this Dataset
   */
  public long countAnnotations() {
    return streamDocuments().mapToLong(d -> d.countAnnotations()).sum();
  }
  
  /**
   * @return the number of Queries in this Dataset
   */
  public long countQueries() {
    return getQueries().size();
  }
  
  /**
   * @return the number of Annotations from a given source in all Documents in this Dataset
   */
  public long countAnnotations(Annotation.Source source) {
    return streamDocuments().mapToLong(d -> d.countAnnotations(source)).sum();
  }
  
  /**
   * @return the number of Annotations of a given type from a given source in all Documents in this Dataset
   */
  public <A extends Annotation> long countAnnotations(Annotation.Source source, Class<A> type) {
    return streamDocuments().mapToLong(d -> d.countAnnotations(source, type)).sum();
  }
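
  /*
   * Example (illustrative): counting gold-standard Annotations of one type.
   * Assumes the Annotation.Source enum provides a GOLD constant and that
   * MentionAnnotation is a concrete Annotation subclass; substitute your own.
   *
   *   long gold = data.countAnnotations(Annotation.Source.GOLD, MentionAnnotation.class);
   */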
  
  /**
   * @return a random Sentence from the Dataset
   */
  @JsonIgnore
  public Sentence getRandomSentence() {
    int index = random.nextInt(countDocuments());
    return getDocument(index).get().getRandomSentence();
  }
  
  public Collection<Query> getQueries() {
    return queries;
  }
  
  public void addQuery(Query q) {
    this.queries.add(q);
  }
  
  public Optional<Query> getQuery(String id) {
    return queries.stream()
      .filter(q -> q.getId().equals(id))
      .findFirst();
  }
  
  public void setQueries(List<Query> queries) {
    this.queries = queries;
  }
  
  /**
   * @return a deep copy of this Dataset (not fully implemented yet!)
   */
  @Override
  public Dataset clone() {
    ArrayList<Document> docs = new ArrayList<>(countDocuments());
    for(Document doc : getDocuments()) {
      docs.add(doc.clone());
    }
    return new Dataset(getName(), docs); // note: Queries are not copied yet
  }

  @Override
  public boolean equals(Object o) {
    if(this == o) {
      return true;
    }
    if(!(o instanceof Dataset)) {
      return false;
    }
    
    Dataset dataset = (Dataset) o;
    return Objects.equals(getName(), dataset.getName()) &&
           Objects.equals(getLanguage(), dataset.getLanguage()) &&
           Objects.equals(getDocuments(), dataset.getDocuments());
  }

  @Override
  public int hashCode() {
    return Objects.hash(getName(), getLanguage(), getDocuments());
  }
}