
edu.jhu.hlt.concrete.ingesters.alnc.ALNCIngester Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of concrete-ingesters-alnc Show documentation
Show all versions of concrete-ingesters-alnc Show documentation
Library providing ingesters and utilities for converting the American Language News Corpus (ALNC) to the Concrete NLP data schema.
/*
* Copyright 2012-2015 Johns Hopkins University HLTCOE. All rights reserved.
* See LICENSE in the project root directory.
*/
package edu.jhu.hlt.concrete.ingesters.alnc;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import edu.jhu.hlt.alnc.ALNCArticleBean;
import edu.jhu.hlt.alnc.ALNCFileConverter;
import edu.jhu.hlt.concrete.Communication;
import edu.jhu.hlt.concrete.ingesters.base.IngestException;
import edu.jhu.hlt.concrete.ingesters.base.stream.IteratorBasedStreamIngester;
import edu.jhu.hlt.concrete.util.ConcreteException;
import edu.jhu.hlt.concrete.util.ProjectConstants;
import edu.jhu.hlt.concrete.util.Timing;
/**
* Class that allows mapping of ALNC documents into Concrete {@link Communication}
* objects.
*/
public class ALNCIngester implements IteratorBasedStreamIngester, AutoCloseable {

  /** Timestamp captured when this ingester was constructed. */
  private final long ts;

  /** Path of the ALNC file being ingested; reported in the tool notes. */
  private final Path path;

  /** Converter wrapping the (possibly decompressed) input stream; released by {@link #close()}. */
  private final ALNCFileConverter conv;

  /**
   * Opens the ALNC file at {@code path} and prepares it for conversion.
   *
   * <p>The input is wrapped in a bzip2 decompressor when the probed content type
   * contains {@code "bzip"}. When the platform cannot determine a content type,
   * a case-insensitive {@code .bz2} file name suffix is used instead.
   *
   * @param path the ALNC file to read
   * @throws IngestException if the file cannot be probed, opened, or handed to the converter
   */
  public ALNCIngester(Path path) throws IngestException {
    this.ts = Timing.currentLocalTime();
    this.path = path;
    InputStream raw = null;
    try {
      final boolean bzip = isBzip2(this.path);
      raw = Files.newInputStream(this.path);
      this.conv = new ALNCFileConverter(bzip ? new BZip2CompressorInputStream(raw) : raw);
    } catch (IOException e) {
      // The file stream may already be open when a wrapper's constructor fails
      // (e.g. the content is not valid bzip2); release it so the handle does not leak.
      if (raw != null) {
        try {
          raw.close();
        } catch (IOException closeFailure) {
          e.addSuppressed(closeFailure);
        }
      }
      throw new IngestException(e);
    }
  }

  /**
   * Decides whether the file at {@code path} should be read through a bzip2 decompressor.
   *
   * @param path the file to inspect
   * @return {@code true} if the probed content type contains {@code "bzip"}, or, when no
   *         content type could be determined, if the file name ends with {@code .bz2}
   * @throws IOException if probing the content type fails
   */
  private static boolean isBzip2(Path path) throws IOException {
    // probeContentType returns null when the type cannot be determined, so it
    // must not be dereferenced unconditionally.
    final String contentType = Files.probeContentType(path);
    if (contentType != null) {
      return contentType.contains("bzip");
    }
    final Path fileName = path.getFileName();
    if (fileName == null) {
      return false;
    }
    final String name = fileName.toString();
    final String suffix = ".bz2";
    // Case-insensitive endsWith; a negative offset (name shorter than suffix) yields false.
    return name.regionMatches(true, name.length() - suffix.length(), suffix, 0, suffix.length());
  }

  /**
   * Returns the kind of document this ingester produces.
   *
   * @return the constant {@code "news"}
   */
  @Override
  public String getKind() {
    return "news";
  }

  /**
   * Returns the timestamp recorded when this ingester was constructed.
   *
   * @return the value of {@code Timing.currentLocalTime()} at construction
   */
  @Override
  public long getTimestamp() {
    return this.ts;
  }

  /**
   * Returns the name of this tool.
   *
   * @return the simple class name of {@link ALNCIngester}
   */
  @Override
  public String getToolName() {
    return ALNCIngester.class.getSimpleName();
  }

  /**
   * Returns notes describing this ingester's input.
   *
   * @return a new mutable list with a single entry naming the original file path
   */
  @Override
  public List<String> getToolNotes() {
    List<String> notes = new ArrayList<>();
    notes.add("Original file path: " + this.path.toString());
    return notes;
  }

  /**
   * Returns the version of this tool.
   *
   * @return {@code ProjectConstants.VERSION}
   */
  @Override
  public String getToolVersion() {
    return ProjectConstants.VERSION;
  }

  /**
   * Creates an iterator over the {@link Communication} objects converted from the file.
   *
   * <p>Each call requests a new stream from the underlying converter.
   *
   * @return an iterator that converts one ALNC article per call to {@code next()}
   * @throws IngestException if the converter fails to produce a stream
   */
  @Override
  public Iterator<Communication> iterator() throws IngestException {
    try {
      return new ALNCCommunicationIterator(this.conv);
    } catch (IOException e) {
      throw new IngestException(e);
    }
  }

  /**
   * Adapts the converter's article iterator into an iterator of {@link Communication}
   * objects, converting each article on demand.
   */
  private static class ALNCCommunicationIterator implements Iterator<Communication> {

    /** Source of ALNC articles obtained from the converter's stream. */
    private final Iterator<ALNCArticleBean> iterator;

    private ALNCCommunicationIterator(ALNCFileConverter conv) throws IOException {
      this.iterator = conv.stream().iterator();
    }

    @Override
    public boolean hasNext() {
      return this.iterator.hasNext();
    }

    /**
     * Converts the next article to a {@link Communication}.
     *
     * @throws RuntimeException wrapping the {@link ConcreteException} if the mapping fails;
     *         the {@link Iterator} contract leaves no room for a checked exception here
     */
    @Override
    public Communication next() {
      try {
        return new CommunicationizableALNCArticle(this.iterator.next()).toCommunication();
      } catch (ConcreteException e) {
        throw new RuntimeException("Error mapping documents from ALNC to Concrete.", e);
      }
    }
  }

  /**
   * Closes the underlying converter.
   *
   * @throws IngestException if closing the converter fails
   */
  @Override
  public void close() throws IngestException {
    try {
      this.conv.close();
    } catch (IOException e) {
      throw new IngestException(e);
    }
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy