de.julielab.genemapper.resources.TransformerDisambiguationBC3DataWriter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gene-mapper-resources Show documentation
Show all versions of gene-mapper-resources Show documentation
This project assembles code and files required to build the dictionaries and indexes used by the JCoRe
Gene Mapper.
The newest version!
package de.julielab.genemapper.resources;
import de.julielab.geneexpbase.data.DocumentSourceFileRegistry;
import de.julielab.geneexpbase.data.DocumentSourceFiles;
import de.julielab.geneexpbase.genemodel.GeneDocument;
import de.julielab.genemapper.Configuration;
import de.julielab.genemapper.GeneMapper;
import de.julielab.genemapper.classification.TransformerDisambiguationDataUtils;
import de.julielab.genemapper.utils.GeneMapperException;
import de.julielab.genemapper.utils.GeneMapperInitializationException;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Command line entry point that writes transformer disambiguation data for the BC3 train set 1 corpus.
 * <p>
 * The documents of the corpus are divided into two groups according to a split mapping file: documents
 * assigned to the {@code dev} partition are written to a separate {@code -dev.tsv} file, all other documents
 * are written to the main output file.
 * </p>
 */
public class TransformerDisambiguationBC3DataWriter {
    private static final Logger log = LoggerFactory.getLogger(TransformerDisambiguationBC3DataWriter.class);

    /**
     * Writes the disambiguation data for the BC3 train set 1 using hard-coded, working-directory-relative
     * configuration, split mapping and output paths.
     *
     * @param args ignored
     * @throws IOException                       if the split mapping file cannot be read
     * @throws GeneMapperException               declared by the data writing step
     * @throws ExecutionException                declared by the data writing step
     * @throws GeneMapperInitializationException declared for the gene mapper setup
     */
    public static void main(String[] args) throws IOException, GeneMapperException, ExecutionException, GeneMapperInitializationException {
        Configuration configuration = new Configuration(new File("data/eval_data/genemapper_for_disambig_opt.properties"));
        DocumentSourceFiles documentSourceFiles = DocumentSourceFileRegistry.bc3Trainset1InferredMentionIds();
        // TODO use Guice injector
        GeneMapper geneMapper = null;//new GeneMapper(configuration);
        String goldTaxMode = TransformerDisambiguationDataUtils.USE_GOLD_TAX_FOR_CANDIDATE_RETRIEVAL ? "goldTax" : "noGoldTax";
        File outputFile = new File("transformerDisambiguationData-bc3trainset1-v" + TransformerDisambiguationDataUtils.VERSION + "-" + goldTaxMode + ".tsv");
        File corpusSplitMapping = new File("splitmappings/bc3-trainset1-10split-5devfreq.txt");
        createDisambiguationData(documentSourceFiles, geneMapper, outputFile, corpusSplitMapping);
    }

    /**
     * Writes the documents of {@code sourceFiles} to two files: the documents listed as {@code dev} in
     * {@code corpusSplitMapping} go to a sibling of {@code outputFile} whose name ends in {@code -dev.tsv},
     * all remaining documents go to {@code outputFile} itself.
     *
     * @param sourceFiles        the corpus to create the data for
     * @param mapper             the gene mapper handed to the data writer
     * @param outputFile         target file for the non-dev (training) documents
     * @param corpusSplitMapping file mapping document IDs to partition names, one whitespace-separated
     *                           {@code docId partition} pair per line
     * @throws IOException           if the split mapping file cannot be read
     * @throws GeneMapperException   declared by the data writing step
     * @throws ExecutionException    declared by the data writing step
     * @throws IllegalStateException if no documents could be loaded; this is currently always the case
     *                               because document loading is still an open TODO
     */
    public static void createDisambiguationData(DocumentSourceFiles sourceFiles, GeneMapper mapper, File outputFile, File corpusSplitMapping) throws IOException, GeneMapperException, ExecutionException {
        Set<String> devDocIds = readDevDocIds(corpusSplitMapping);
        log.info("Writing transformer training data for corpus {} to {}", sourceFiles.getName(), outputFile);
        File devFile = deriveDevFile(outputFile);
        log.info("Got {} dev docs from {} that will be omitted from the training data and written to {}.", devDocIds.size(), corpusSplitMapping, devFile);
        // TODO use Guice injector
        List<GeneDocument> documents = null;//DocumentLoader.getDocuments(sourceFiles, mapper.getConfiguration()).collect(Collectors.toList());
        if (documents == null) {
            // Fail with an explicit message instead of a bare NullPointerException on documents.stream().
            throw new IllegalStateException("Document loading is not implemented yet (see TODO); cannot write disambiguation data for corpus " + sourceFiles.getName());
        }
        Stream<GeneDocument> trainStream = documents.stream().filter(d -> !devDocIds.contains(d.getId()));
        Stream<GeneDocument> devStream = documents.stream().filter(d -> devDocIds.contains(d.getId()));
        TransformerDisambiguationDataUtils.writeData(mapper, outputFile, trainStream);
        TransformerDisambiguationDataUtils.writeData(mapper, devFile, devStream);
    }

    /**
     * Collects the IDs of all documents assigned to the {@code dev} partition.
     * <p>
     * Split mappings are created when running SmacOptimizationRoute implementations. They store a file mapping
     * each document in the corpus to a partition. Format: {@code docId partition}, separated by whitespace.
     * Lines with fewer than two fields (e.g. blank lines) are skipped.
     * </p>
     *
     * @param corpusSplitMapping the split mapping file, read as UTF-8
     * @return the IDs of the documents in the {@code dev} partition
     * @throws IOException if the file cannot be read
     */
    private static Set<String> readDevDocIds(File corpusSplitMapping) throws IOException {
        return FileUtils.readLines(corpusSplitMapping, StandardCharsets.UTF_8).stream()
                .map(line -> line.split("\\s+"))
                // Guard against blank or incomplete lines before looking at the partition column.
                .filter(fields -> fields.length >= 2 && "dev".equals(fields[1]))
                .map(fields -> fields[0])
                .collect(Collectors.toSet());
    }

    /**
     * Derives the dev output file by replacing the extension of {@code outputFile} with {@code -dev.tsv}.
     * The extension is searched in the file name only, so dots in directory names are left alone; a file name
     * without any extension simply gets {@code -dev.tsv} appended.
     *
     * @param outputFile the main output file
     * @return the absolute file to write the dev documents to
     */
    private static File deriveDevFile(File outputFile) {
        File absoluteOutput = outputFile.getAbsoluteFile();
        String fileName = absoluteOutput.getName();
        int extensionStart = fileName.lastIndexOf('.');
        String baseName = extensionStart >= 0 ? fileName.substring(0, extensionStart) : fileName;
        return new File(absoluteOutput.getParentFile(), baseName + "-dev.tsv");
    }
}