de.julielab.genemapper.resources.SimConceptCorpusToIOBConverter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gene-mapper-resources Show documentation
Show all versions of gene-mapper-resources Show documentation
This project assembles code and files required to build the dictionaries and indexes used by the JCoRe
Gene Mapper.
The newest version!
package de.julielab.genemapper.resources;
import de.julielab.genemapper.composites.CompositeMentionTokenizer;
import de.julielab.genemapper.composites.CompositeToken;
import de.julielab.java.utilities.FileUtilities;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Converts a tab-separated SimConcept corpus file into a two-column IOB file.
 *
 * <p>Each annotation row of the input contributes one mention. The mention text is split into
 * tokens by a {@link CompositeMentionTokenizer}, and every token is written on its own line as
 * {@code token<TAB>[B|I]-<label>}, where {@code <label>} is the character found at the token's
 * position in the row's label column. Mentions are separated by an empty line.
 */
public class SimConceptCorpusToIOBConverter {

    /**
     * Selects annotation rows: lines that start with one or more digits followed by a tab.
     * Compiled once instead of once per input line.
     */
    private static final Pattern ANNOTATION_LINE = Pattern.compile("^[0-9]+\t.*");

    /** Zero-based index of the column holding the mention text. */
    private static final int MENTION_COLUMN = 3;

    /** Zero-based index of the column holding the per-token label string. */
    private static final int LABEL_COLUMN = 6;

    /**
     * Runs the conversion with hard-coded input, output and blacklist paths; {@code args} is
     * not evaluated.
     *
     * @param args ignored
     * @throws IOException if reading the input or writing the output fails
     */
    public static void main(String[] args) throws IOException {
        SimConceptCorpusToIOBConverter converter = new SimConceptCorpusToIOBConverter();
        // NOTE(review): the input is "Disease.txt" while the output is named
        // "simconceptChemical.iob" and the blacklist is a gene list - confirm this combination
        // is intended.
        converter.convertPubTatorFormat(new File("/Users/faessler/Downloads/SimConcept/corpus/Disease.txt"),
                new File("simconceptChemical.iob"),
                new File("jcore-gene-mapper-ae/data/eval_data/bc2_data/test/test.genelist"));
    }

    /**
     * Reads the annotation rows of {@code simconceptInputFile}, drops those whose first column
     * is blacklisted, and writes the remaining mentions in IOB format to {@code iobOutputFile}.
     *
     * <p>All rows are converted before the first output line is written.
     *
     * @param simconceptInputFile the tab-separated corpus file to read
     * @param iobOutputFile       the file to write the IOB lines to
     * @param blacklistDocIdsFile tab-separated file whose first column lists the IDs to skip;
     *                            may be {@code null}, in which case nothing is skipped
     * @throws IOException           if reading or writing fails
     * @throws IllegalStateException if an annotation row has too few columns, or if its number
     *                               of tokens differs from the length of its label string
     */
    public void convertPubTatorFormat(File simconceptInputFile, File iobOutputFile, File blacklistDocIdsFile) throws IOException {
        final Set<String> blacklist = readBlacklist(blacklistDocIdsFile);
        final CompositeMentionTokenizer tokenizer = new CompositeMentionTokenizer();
        try (BufferedReader br = FileUtilities.getReaderFromFile(simconceptInputFile);
             BufferedWriter bw = FileUtilities.getWriterToFile(iobOutputFile)) {
            List<Pair<String, String>> mentions = br.lines()
                    .filter(line -> ANNOTATION_LINE.matcher(line).matches())
                    .map(line -> line.split("\t"))
                    .filter(columns -> !blacklist.contains(columns[0]))
                    .map(columns -> toTokenizedMention(tokenizer, columns))
                    .collect(Collectors.toList());
            for (Pair<String, String> mention : mentions) {
                writeMention(bw, mention);
            }
        }
    }

    /**
     * Collects the first tab-separated column of every line of the blacklist file.
     *
     * @param blacklistDocIdsFile the blacklist file, or {@code null}
     * @return the set of blacklisted IDs; empty if no file was given
     * @throws IOException if the file cannot be read
     */
    private static Set<String> readBlacklist(File blacklistDocIdsFile) throws IOException {
        if (blacklistDocIdsFile == null) {
            return Collections.emptySet();
        }
        try (BufferedReader br = FileUtilities.getReaderFromFile(blacklistDocIdsFile)) {
            return br.lines()
                    .map(line -> line.split("\\t"))
                    .map(columns -> columns[0])
                    .collect(Collectors.toSet());
        }
    }

    /**
     * Turns one annotation row into a pair of (space-joined token texts, label string).
     *
     * @param tokenizer the tokenizer applied to the mention column
     * @param columns   the tab-separated columns of the row
     * @return the tokenized mention on the left and the unmodified label column on the right
     * @throws IllegalStateException if the row does not reach the label column
     */
    private static Pair<String, String> toTokenizedMention(CompositeMentionTokenizer tokenizer, String[] columns) {
        if (columns.length <= LABEL_COLUMN) {
            // Fail with the offending row instead of a bare ArrayIndexOutOfBoundsException.
            throw new IllegalStateException("Expected at least " + (LABEL_COLUMN + 1)
                    + " tab-separated columns but got " + columns.length + ": " + String.join("\t", columns));
        }
        String tokenized = tokenizer.tokenize(columns[MENTION_COLUMN])
                .map(CompositeToken::getText)
                .collect(Collectors.joining(" "));
        return new ImmutablePair<>(tokenized, columns[LABEL_COLUMN]);
    }

    /**
     * Writes one mention as IOB lines followed by an empty separator line.
     *
     * @param bw      the writer receiving the output
     * @param mention whitespace-separated tokens on the left, one label character per token on
     *                the right
     * @throws IOException           if writing fails
     * @throws IllegalStateException if the token count differs from the label string length
     */
    private static void writeMention(BufferedWriter bw, Pair<String, String> mention) throws IOException {
        String[] tokens = mention.getLeft().split("\\s");
        String labels = mention.getRight();
        if (tokens.length != labels.length()) {
            throw new IllegalStateException("The subtoken annotation does not match the created annotation: " + mention);
        }
        for (int i = 0; i < tokens.length; i++) {
            char label = labels.charAt(i);
            // The first token always opens a chunk; comparing against a sentinel character
            // would mislabel it as "I" whenever the real label equals the sentinel.
            boolean continuesPrevious = i > 0 && label == labels.charAt(i - 1);
            char iobPrefix = continuesPrevious ? 'I' : 'B';
            bw.write(tokens[i]);
            bw.write("\t");
            bw.write(iobPrefix + "-" + label);
            bw.newLine();
        }
        bw.newLine();
    }
}