de.datexis.sector.reader.Wiki727Reader Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of texoo-sector Show documentation
TeXoo module for Topic Segmentation and Classification (SECTOR)
There is a newer version: 1.3.3
package de.datexis.sector.reader;

import de.datexis.common.Resource;
import de.datexis.model.*;
import de.datexis.preprocess.DocumentFactory;
import de.datexis.reader.RawTextDatasetReader;
import de.datexis.sector.model.SectionAnnotation;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.LineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Reader for Wiki-727l Dataset by Omri Koshorek, Adir Cohen, Noam Mor, Michael
 * Rotman, and Jonathan Berant. 2018. "Text segmentation as a supervised learning task."
 * In Proceedings of the 2018 Conference of the North American Chapter of the Association for
 * Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers), volume 2,
 * pages 469–473.
 * @author Sebastian Arnold 
 */
public class Wiki727Reader extends RawTextDatasetReader {

  protected final static Logger log = LoggerFactory.getLogger(Wiki727Reader.class);
  
  protected int sectionLevel = 2;
  protected boolean skipPrefaceText = false;
  protected boolean skipPrefaceAnnotation = false;
  
  protected Pattern SECTION_PATTERN = Pattern.compile("^========,(\\d+),(.+?)\\.$");
  
  /**
   * Create Annotations down to a given level.
   * 0 - include all section
   * 1 - the whole document  as one section
   * 2 - include 
 sections
   * 3 - include  subsections
   * etc.
   */
  public Wiki727Reader withSectionLevel(int level) {
    this.sectionLevel = level;
    return this;
  }
  
  public Wiki727Reader withSkipPreface(boolean skip) {
    this.skipPrefaceText = skip;
    return this;
  }
  
  /**
   * Read a single Document from file.
   */
  @Override
  public Document readDocumentFromFile(Resource file) {

    try(InputStream in = file.getInputStream()) {
      CharsetDecoder utf8 = StandardCharsets.UTF_8.newDecoder();
      BufferedReader br = new BufferedReader(new InputStreamReader(in, utf8));
      Iterator it = new LineIterator(br);
      int k = 0;
      int j = 0;
      int length = 0;
      
      Document doc = new Document();
      doc.setId(file.getFileName());
      doc.setSource(file.toString());
      doc.setType("wiki");
      
      String line;
      StringBuilder text = new StringBuilder();
      SectionAnnotation ann = new SectionAnnotation(Annotation.Source.GOLD);
      String sectionHeading = "";
      
      while(it.hasNext()) {
        
        line = it.next();
        Matcher matcher = SECTION_PATTERN.matcher(line);
        
        if(line.startsWith("=====") && matcher.matches()) {
          int level = Integer.parseInt(matcher.group(1));
          String heading = matcher.group(2);
          
          // check heading depth
          if(sectionLevel == 0 || level <= sectionLevel) {
            
            // end the current section
            String sectionText = text.toString();
            if(sectionText.trim().length() > 0) {
              addToDocument(sectionText, sectionHeading, doc);
            }
            
            // rebuild heading for next section
            int split = 0;
            while(--level > 1) split = sectionHeading.indexOf(" | ", split + 1);
            if(split > 0) sectionHeading = sectionHeading.substring(0, split) + " | "; // remove
            else if(split < 0) sectionHeading = sectionHeading + " | "; // add
            else sectionHeading = ""; // reset
            sectionHeading += heading;
            
            text = new StringBuilder();
            
          }
          
        } else {
          
          if(text.length() > 0) text.append(" ");
          line = line
              .replaceAll("\\*\\*\\*LIST\\*\\*\\*", "")
              .replaceAll("\\*\\*\\*formula\\*\\*\\*", "")
              .replaceAll("\\*\\*\\*codice\\*\\*\\*", "");
          if(!line.trim().isEmpty()) {
            text.append(line).append("\n");
          }
          
        }
        
      } 
      
      // end the last section
      String sectionText = text.toString();
      if(sectionText.trim().length() > 0) {
        addToDocument(sectionText, sectionHeading, doc);
      }

      return doc;
      
    } catch(IOException ex) {
      // IOException is now allowed in Stream
      log.error(ex.toString());
      throw new RuntimeException(ex.toString(), ex.getCause());
    }
    
  }

  
  private void addToDocument(String text, String sectionHeading, Document doc) {
    if(text.trim().length() == 0) return;
    //Document section = DocumentFactory.fromText(text, DocumentFactory.Newlines.DISCARD);
    Document section = new Document();
    // we don't split sentences here but use entire paragraphs to stay comparable to the paper
    for(String paragraph : text.split("\n")) {
      //List tokens = DocumentFactory.createTokensFromText(paragraph.trim() + "\n");
      if(paragraph.trim().isEmpty()) continue;
      Document temp = DocumentFactory.fromText(paragraph.trim() + "\n", DocumentFactory.Newlines.KEEP);
      section.addSentence(DocumentFactory.createSentenceFromTokens(temp.getTokens()));
    }
    if(sectionHeading == null) return;
    String sectionHead = sectionHeading.replaceFirst("\\|.+$","").trim().toLowerCase();
    if(skipPrefaceText && sectionHead.equals("preface")) {
      //doc.setAbstract(section.getText());
    } else {
      doc.append(section);
      SectionAnnotation sectionAnn = new SectionAnnotation(Annotation.Source.GOLD, doc.getType(), sectionHeading);
      sectionAnn.setSectionLabel(sectionHeading); // heading is already set in constructor
      sectionAnn.setBegin(section.getBegin());
      sectionAnn.setEnd(section.getEnd());
      doc.addAnnotation(sectionAnn);
    }
  }
  
  

}
de.datexis.sector.reader.Wiki727Reader Maven / Gradle / Ivy

as one section * 2 - include

sections * 3 - include