de.datexis.parvec.encoder.ParVecIterator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of texoo-sector Show documentation
TeXoo module for Topic Segmentation and Classification (SECTOR)
There is a newer version: 1.3.3
package de.datexis.parvec.encoder;

import de.datexis.model.*;
import de.datexis.sector.model.SectionAnnotation;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import java.util.stream.Collectors;
import org.deeplearning4j.text.sentenceiterator.SentencePreProcessor;
import org.deeplearning4j.text.sentenceiterator.labelaware.LabelAwareSentenceIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A {@link LabelAwareSentenceIterator} that iterates over Sections (not Sentences).
 *
 * <p>Every element returned by {@link #nextSentence()} is the text of one GOLD
 * {@link SectionAnnotation} of a {@link Document}; {@link #currentLabel()} returns the
 * section label of the element that was returned last.
 *
 * <p>The constructors do not position the iterator: {@link #reset()} has to be called
 * before the first iteration, until then {@link #hasNext()} returns {@code false}.
 *
 * @author Sebastian Arnold
 */
public class ParVecIterator implements LabelAwareSentenceIterator {

  protected final static Logger log = LoggerFactory.getLogger(ParVecIterator.class);

  /** Private copy of the documents to iterate over; shuffled in place by {@link #reset()} if requested. */
  protected List<Document> documents;
  /** Iterator over {@link #documents}; {@code null} before {@link #reset()} and after {@link #finish()}. */
  protected Iterator<Document> docIt;
  /** The Document whose sections are currently being iterated. */
  protected Document currentDoc;
  /** GOLD section annotations of {@link #currentDoc}. */
  protected List<SectionAnnotation> sections;
  /** Iterator over {@link #sections}; {@code null} until the first Document was loaded. */
  protected Iterator<SectionAnnotation> annIt;
  /** The section that was returned by the last call to {@link #nextSentence()}. */
  protected SectionAnnotation currentSection;
  
  /** Not used by this class; kept for subclasses that may rely on it. */
  protected int batchSize;
  /** If {@code true}, the document order is shuffled on every {@link #reset()}. */
  protected boolean randomize;
  
  /**
   * Creates an iterator over all Documents of a Dataset.
   * @param data the Dataset that provides the Documents
   * @param randomize if {@code true}, Documents are shuffled on every {@link #reset()}
   */
  public ParVecIterator(Dataset data, boolean randomize) {
    this(data.getDocuments(), randomize);
  }

  /**
   * Creates an iterator over the given Documents. The collection is copied, so the caller's
   * collection is neither shuffled nor cleared by this iterator.
   * @param docs the Documents to iterate over
   * @param randomize if {@code true}, Documents are shuffled on every {@link #reset()}
   */
  public ParVecIterator(Collection<? extends Document> docs, boolean randomize) {
    this.documents = new ArrayList<>(docs);
    this.randomize = randomize;
  }
  
  /**
   * Rewinds the iterator to the first Document, shuffling the document order first
   * if randomization was requested.
   */
  @Override
  public void reset() {
    currentSection = null;
    annIt = null;
    sections = new ArrayList<>(64);
    currentDoc = null;
    if(randomize) Collections.shuffle(documents, new Random(System.nanoTime()));
    docIt = documents.iterator();
  }
  
  /**
   * Checks whether another section is available, loading following Documents as needed.
   * @return {@code true} if {@link #nextSentence()} can return another section
   */
  @Override
  public boolean hasNext() {
    // iterative instead of recursive, so that a long run of Documents without
    // GOLD sections cannot overflow the call stack
    while(!hasNextSection()) {
      if(!hasNextDocument()) return false;
      nextDocument();
    }
    return true;
  }
  
  /** @return {@code true} if the current Document has sections left to return */
  protected boolean hasNextSection() {
    return annIt != null && annIt.hasNext();
  }
  
  /** @return {@code true} if there are Documents left that were not loaded yet */
  protected boolean hasNextDocument() {
    return docIt != null && docIt.hasNext();
  }
  
  /**
   * Advances to the next Document and collects its GOLD section annotations.
   * Callers have to check {@link #hasNextDocument()} first.
   */
  public void nextDocument() {
    currentDoc = docIt.next();
    sections = currentDoc.streamAnnotations(Annotation.Source.GOLD, SectionAnnotation.class).collect(Collectors.toList());
    //log.trace("returning doc {} with {} sections.", currentDoc.getId(), sections.size());
    annIt = sections.iterator();
  }
  
  /**
   * Returns the text of the next section. The trimmed texts of all Tokens in the range of
   * the section are joined with single spaces; newlines that remain inside a Token are
   * written as {@code *NL*} and tabs as {@code *t*}.
   * @return the text of the next section
   * @throws java.util.NoSuchElementException if there are no sections left
   */
  @Override
  public String nextSentence() {
    // advance over document boundaries here as well, so that callers are not forced to
    // call hasNext() before every single nextSentence()
    if(!hasNext()) {
      throw new java.util.NoSuchElementException("no more sections available, call reset() to start over");
    }
    currentSection = annIt.next();
    return currentDoc
        .streamTokensInRange(currentSection.getBegin(), currentSection.getEnd(), true)
        .map(t -> t
            .getText()
            .trim()
            // literal replacement, no regular expression needed
            .replace("\n", "*NL*")
            .replace("\t", "*t*"))
        .collect(Collectors.joining(" "));
  }

  /**
   * @return the section label of the section returned by the last {@link #nextSentence()};
   *         only valid after {@link #nextSentence()} was called
   */
  @Override
  public String currentLabel() {
    return currentSection.getSectionLabel();
  }

  /**
   * @return a list that contains only {@link #currentLabel()};
   *         only valid after {@link #nextSentence()} was called
   */
  @Override
  public List<String> currentLabels() {
    return Arrays.asList(currentSection.getSectionLabel());
  }

  /**
   * Releases all references held by this iterator, including its copy of the Documents.
   * A later {@link #reset()} will therefore iterate over an empty list.
   */
  @Override
  public void finish() {
    currentSection = null;
    annIt = null;
    // sections is not assigned before reset() or nextDocument() was called
    if(sections != null) sections.clear();
    currentDoc = null;
    docIt = null;
    documents.clear();
  }

  /**
   * Pre-processing is not supported by this iterator.
   * @return always {@code null}
   */
  @Override
  public SentencePreProcessor getPreProcessor() {
    return null;
  }

  /**
   * Pre-processing is not supported by this iterator, the given pre-processor is ignored.
   * @param preProcessor ignored
   */
  @Override
  public void setPreProcessor(SentencePreProcessor preProcessor) {
  }
  
}