/*
 * de.datexis.sector.reader.MultiSegReader (Maven / Gradle / Ivy artifact: texoo-sector)
 * TeXoo module for Topic Segmentation and Classification (SECTOR)
 */
package de.datexis.sector.reader;
import de.datexis.common.InternalResource;
import de.datexis.common.Resource;
import de.datexis.model.Annotation;
import de.datexis.model.Dataset;
import de.datexis.model.Document;
import de.datexis.preprocess.DocumentFactory;
import de.datexis.reader.RawTextDatasetReader;
import de.datexis.sector.model.SectionAnnotation;
import org.apache.commons.io.LineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Reader for the MultiSeg dataset by Ivan Titov: "Multi-document topic segmentation". CIKM '10, Pages 1119-1128.
 *
 * <p>Each document is stored in a file named {@code <base>.<number>}; the section boundaries of all
 * documents sharing the same {@code <base>} are read from a sibling file {@code <base>.label}.
 *
 * @author Sebastian Arnold
 */
public class MultiSegReader extends RawTextDatasetReader {

  protected static final Logger log = LoggerFactory.getLogger(MultiSegReader.class);

  /** File name pattern handed to the directory reader: {@code <name>.<number>}. */
  private static final String DATASET_FILE_PATTERN = "^(.+?)\\.(\\d+)$";

  /** Splits a document path into (1) a path prefix ending in '/', (2) the remaining base name, (3) the numeric suffix. */
  private static final Pattern DOCUMENT_PATH_PATTERN = Pattern.compile("^(.+?/)(.+?)\\.(\\d+)$");

  /**
   * Read a Dataset from a directory or from a single document file.
   *
   * @param path a directory (delegated to {@code readDatasetFromDirectory} with the
   *             {@code <name>.<number>} file pattern) or a single document file
   * @return the Dataset; for a single file it is named after the file and contains one Document
   * @throws FileNotFoundException if the path is neither a directory nor a file
   * @throws IOException if reading fails
   */
  @Override
  public Dataset read(Resource path) throws IOException {
    if(path.isDirectory()) {
      return readDatasetFromDirectory(path, DATASET_FILE_PATTERN);
    } else if(path.isFile()) {
      Document doc = readDocumentFromFile(path);
      Dataset data = new Dataset(path.getFileName());
      data.addDocument(doc);
      return data;
    } else {
      throw new FileNotFoundException("cannot open path: " + path.toString());
    }
  }

  /**
   * Read a single Document from file. The text between each section's begin and end line
   * (as listed in the matching {@code .label} file) is appended to the Document as one section.
   *
   * @param file the document file; its path must look like {@code <dir>/<base>.<number>}
   * @return the Document with one gold SectionAnnotation per non-empty section
   * @throws IllegalArgumentException if the path does not match the expected pattern
   * @throws RuntimeException if the document or its label file cannot be read (wraps the IOException)
   */
  @Override
  public Document readDocumentFromFile(Resource file) {
    // The charset is given explicitly so decoding does not depend on the platform default.
    try(InputStream in = file.getInputStream();
        BufferedReader br = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
      Iterator<String> it = new LineIterator(br);

      Document doc = new Document();
      doc.setId(file.getFileName());
      doc.setSource(file.toString());
      doc.setType("multiseg");

      Matcher m = DOCUMENT_PATH_PATTERN.matcher(file.toString());
      if(!m.matches()) throw new IllegalArgumentException("invalid file name");
      String basePath = m.group(1);
      String baseDoc = m.group(2);
      int docNum = Integer.parseInt(m.group(3));

      // the label file lives next to the document and is shared by all <base>.<number> files
      Resource labels = file instanceof InternalResource ?
          Resource.fromJAR(basePath + baseDoc + ".label") :
          Resource.fromFile(basePath, baseDoc + ".label");
      TreeSet<Integer>[] sections = readSectionsFromLabel(labels, docNum);
      TreeSet<Integer> sectionStarts = sections[0];
      TreeSet<Integer> sectionEnds = sections[1];

      int lineIndex = 0;    // zero-based index of the current line in the document file
      int sectionCount = 0; // running id of the next non-empty section
      StringBuilder text = new StringBuilder();
      while(it.hasNext()) {
        String line = it.next();
        if(sectionStarts.contains(lineIndex)) {
          // begin new section: anything collected outside of a section is dropped
          text = new StringBuilder();
        }
        text.append(line).append("\n");
        if(sectionEnds.contains(lineIndex)) {
          // end the current section; the id is only consumed by non-empty sections
          String sectionText = text.toString();
          if(sectionText.trim().length() > 0) {
            addToDocument(sectionText, sectionCount++, doc);
          }
          text = new StringBuilder();
        }
        lineIndex++;
      }
      // Sections are delimited by explicit begin-end pairs, so text that follows
      // the last end line is intentionally not added to the document.
      return doc;
    } catch(IOException ex) {
      // this method declares no checked exception, so the IOException is rethrown unchecked
      log.error(ex.toString());
      throw new RuntimeException(ex.toString(), ex);
    }
  }

  /**
   * Append one section to the Document and annotate it with a gold SectionAnnotation
   * whose heading and label are the section id.
   *
   * @param text the section text, one paragraph per line; blank text is ignored
   * @param sectionId id used as section heading and label
   * @param doc the Document to append to
   */
  private void addToDocument(String text, int sectionId, Document doc) {
    if(text.trim().length() == 0) return;
    Document section = new Document();
    // we don't split sentences here but use entire paragraphs to stay comparable to the paper
    for(String paragraph : text.split("\n")) {
      Document temp = DocumentFactory.fromTokenizedText(paragraph.trim() + "\n");
      section.addSentence(DocumentFactory.createSentenceFromTokens(temp.getTokens()));
    }
    String sectionHead = Integer.toString(sectionId);
    // NOTE(review): begin/end are read after append(), presumably because append() shifts
    // the section's offsets into the document's coordinates - keep this order.
    doc.append(section);
    SectionAnnotation sectionAnn = new SectionAnnotation(Annotation.Source.GOLD, doc.getType(), sectionHead);
    sectionAnn.setSectionLabel(sectionHead); // heading is already set in constructor
    sectionAnn.setBegin(section.getBegin());
    sectionAnn.setEnd(section.getEnd());
    doc.addAnnotation(sectionAnn);
  }

  /**
   * Read the section boundaries of one document from a label file.
   *
   * <p>Every non-blank line consists of a topic label followed by space-separated entries of the
   * form {@code <docId>::<start>-<end>}. Only entries whose {@code docId} equals {@code docNum}
   * are used; blank lines are skipped.
   *
   * @param file the label file
   * @param docNum number of the document whose sections are requested
   * @return a two-element array: index 0 holds the line indices where a section starts,
   *         index 1 the line indices where a section ends
   * @throws IOException if the label file cannot be read or is not valid UTF-8
   * @throws NumberFormatException if a document id or line index is not an integer
   */
  protected TreeSet<Integer>[] readSectionsFromLabel(Resource file, int docNum) throws IOException {
    TreeSet<Integer> sectionStarts = new TreeSet<>();
    TreeSet<Integer> sectionEnds = new TreeSet<>();
    CharsetDecoder utf8 = StandardCharsets.UTF_8.newDecoder();
    try(InputStream in = file.getInputStream();
        BufferedReader br = new BufferedReader(new InputStreamReader(in, utf8))) {
      String line;
      while((line = br.readLine()) != null) {
        StringTokenizer tokens = new StringTokenizer(line, " ");
        if(!tokens.hasMoreTokens()) continue; // blank line, e.g. at the end of the file
        tokens.nextToken(); // the first token is the topic label, which is not needed here
        while(tokens.hasMoreTokens()) {
          String[] segs = tokens.nextToken().split("::", -1);
          int docId = Integer.parseInt(segs[0]);
          String[] segPoints = segs[1].split("-", -1);
          int start = Integer.parseInt(segPoints[0]);
          int end = Integer.parseInt(segPoints[1]);
          if(docId == docNum) {
            sectionStarts.add(start);
            sectionEnds.add(end);
          }
        }
      }
    }
    // Java cannot create a generic array directly; this one only ever holds TreeSet<Integer>.
    @SuppressWarnings("unchecked")
    TreeSet<Integer>[] result = new TreeSet[] { sectionStarts, sectionEnds };
    return result;
  }
}