de.datexis.sector.reader.WikiSectionReader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of texoo-sector Show documentation
Show all versions of texoo-sector Show documentation
TeXoo module for Topic Segmentation and Classification (SECTOR)
package de.datexis.sector.reader;
import de.datexis.common.ObjectSerializer;
import de.datexis.common.Resource;
import de.datexis.model.Annotation;
import de.datexis.model.Dataset;
import de.datexis.model.Document;
import de.datexis.reader.DatasetReader;
import de.datexis.sector.model.WikiDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
 * Reads WikiSection datasets from JSON files.
 *
 * <p>Every annotation of a loaded document is marked with the source
 * {@code Annotation.Source.GOLD}.
 *
 * @author Sebastian Arnold
 */
public class WikiSectionReader implements DatasetReader {

  protected static final Logger log = LoggerFactory.getLogger(WikiSectionReader.class);

  /**
   * Reads a dataset from the given JSON resource.
   *
   * @param path resource pointing to the JSON file
   * @return the dataset produced by {@link #readDatasetFromJSON(Resource)}
   * @throws IOException if the resource cannot be read
   */
  @Override
  public Dataset read(Resource path) throws IOException {
    return readDatasetFromJSON(path);
  }

  /**
   * Reads all documents from a JSON resource into a {@link Dataset}.
   *
   * <p>The dataset is named after the resource's file name with {@code ".json"} removed.
   * Annotations of each document are set to source {@code GOLD} with confidence {@code 1.0}.
   * Documents for which {@code isEmpty()} returns true are skipped and logged as a warning.
   *
   * @param path resource pointing to the JSON file
   * @return dataset containing all non-empty documents, in iteration order
   * @throws IOException if the resource cannot be read
   */
  public static Dataset readDatasetFromJSON(Resource path) throws IOException {
    log.info("Reading Wiki Articles from {}", path.toString());
    Dataset result = new Dataset(path.getFileName().replace(".json", ""));
    // NOTE(review): assumes readJSONDocumentIterable returns an Iterator<Document>; confirm
    // against ObjectSerializer. A raw Iterator would not compile with the typed next() below.
    Iterator<Document> it = ObjectSerializer.readJSONDocumentIterable(path);
    while (it.hasNext()) {
      Document doc = it.next();
      for (Annotation ann : doc.getAnnotations()) {
        ann.setSource(Annotation.Source.GOLD);
        ann.setConfidence(1.0);
      }
      if (!doc.isEmpty()) {
        result.addDocument(doc);
      } else {
        log.warn("read empty document {}", doc.getId());
      }
    }
    return result;
  }

  /**
   * Reads all {@link WikiDocument}s from a JSON resource into a list.
   *
   * <p>Annotations of each document are set to source {@code GOLD}; unlike
   * {@link #readDatasetFromJSON(Resource)}, the confidence is left as read and empty
   * documents are not filtered out.
   *
   * @param path resource pointing to the JSON file
   * @return mutable list of all documents, in the order they appear in the input
   * @throws IOException if the resource cannot be opened or read
   */
  public static List<WikiDocument> readWikiDocumentsFromJSON(Resource path) throws IOException {
    log.info("Reading Wiki Articles from {}", path.toString());
    List<WikiDocument> result = new ArrayList<>();
    // Close the stream even when parsing fails half-way through the file.
    try (InputStream in = path.getInputStream()) {
      Iterator<WikiDocument> it =
          ObjectSerializer.getObjectMapper().readerFor(WikiDocument.class).readValues(in);
      while (it.hasNext()) {
        WikiDocument doc = it.next();
        for (Annotation ann : doc.getAnnotations()) {
          ann.setSource(Annotation.Source.GOLD);
        }
        result.add(doc);
      }
    }
    return result;
  }
}