All downloads are free. The search and download functionality uses the official Maven repository.

de.julielab.genemapper.resources.SimConceptCorpusToIOBConverter Maven / Gradle / Ivy

Go to download

This project assembles code and files required to build the dictionaries and indexes used by the JCoRe Gene Mapper.

The newest version!
package de.julielab.genemapper.resources;

import de.julielab.genemapper.composites.CompositeMentionTokenizer;
import de.julielab.genemapper.composites.CompositeToken;
import de.julielab.java.utilities.FileUtilities;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Converts a tab-separated SimConcept corpus file into a token-per-line IOB file.
 * <p>
 * Each accepted input line contributes one mention (column index 3) and one label string
 * (column index 6). The mention is re-tokenized with {@link CompositeMentionTokenizer}, and
 * every resulting token is written together with an IOB tag derived from the character at the
 * same position of the label string.
 */
public class SimConceptCorpusToIOBConverter {

    /**
     * Runs the conversion with hard-coded input, output and blacklist paths.
     *
     * @param args ignored
     * @throws IOException if reading or writing any of the files fails
     */
    public static void main(String[] args) throws IOException {
        SimConceptCorpusToIOBConverter converter = new SimConceptCorpusToIOBConverter();
        // NOTE(review): the input is the Disease corpus while the output file name says
        // "Chemical" - confirm that this pairing is intended.
        converter.convertPubTatorFormat(new File("/Users/faessler/Downloads/SimConcept/corpus/Disease.txt"),
                new File("simconceptChemical.iob"),
                new File("jcore-gene-mapper-ae/data/eval_data/bc2_data/test/test.genelist"));
    }

    /**
     * Reads the annotation lines of {@code simconceptInputFile} and writes them in IOB format to
     * {@code iobOutputFile}.
     * <p>
     * Only lines that start with a run of digits followed by a tab are processed. Lines whose
     * first column is contained in the blacklist are skipped. For every remaining line the
     * tokenized mention is written one token per line as {@code token<TAB>B-c} or
     * {@code token<TAB>I-c}, where {@code c} is the label character aligned with the token. A token
     * is tagged {@code I} when its label character equals the one of the preceding token of the
     * same mention, otherwise {@code B}. An empty line is written after each mention.
     * <p>
     * Accepted lines are indexed up to column 6 without a length check, so a shorter line raises
     * an {@link ArrayIndexOutOfBoundsException}.
     *
     * @param simconceptInputFile the tab-separated corpus file to convert
     * @param iobOutputFile       the file the IOB output is written to
     * @param blacklistDocIdsFile a tab-separated file whose first column holds the IDs to exclude;
     *                            may be {@code null}, in which case nothing is excluded
     * @throws IOException           if reading or writing any of the files fails
     * @throws IllegalStateException if the number of tokens of a mention differs from the length of
     *                               its label string
     */
    public void convertPubTatorFormat(File simconceptInputFile, File iobOutputFile, File blacklistDocIdsFile) throws IOException {
        Set<String> blacklist = Collections.emptySet();
        if (blacklistDocIdsFile != null) {
            try (BufferedReader br = FileUtilities.getReaderFromFile(blacklistDocIdsFile)) {
                // Only the first column of each blacklist line is used.
                blacklist = br.lines().map(line -> line.split("\\t")).map(split -> split[0]).collect(Collectors.toSet());
            }
        }
        CompositeMentionTokenizer tokenizer = new CompositeMentionTokenizer();
        // Lambdas may only capture effectively final variables; 'blacklist' is reassigned above.
        final Set<String> finalBlackList = blacklist;
        try (BufferedReader br = FileUtilities.getReaderFromFile(simconceptInputFile);
             BufferedWriter bw = FileUtilities.getWriterToFile(iobOutputFile)) {
            List<Pair<String, String>> geneList = br.lines()
                    .filter(line -> line.matches("^[0-9]+\t.*"))
                    .map(line -> line.split("\t"))
                    .filter(split -> !finalBlackList.contains(split[0]))
                    .map(split -> {
                        // Replace the mention by its tokens joined with single spaces so that it can be
                        // split into tokens again on whitespace below.
                        split[3] = tokenizer.tokenize(split[3]).map(CompositeToken::getText).collect(Collectors.joining(" "));
                        return split;
                    })
                    .map(split -> new ImmutablePair<>(split[3], split[6])).collect(Collectors.toList());
            for (Pair<String, String> p : geneList) {
                String[] split = p.getLeft().split("\\s");
                // Exactly one label character is expected per token.
                if (split.length != p.getRight().length()) {
                    throw new IllegalStateException("The subtoken annotation does not match the created annotation: " + p);
                }
                // Start value for the previous label; the first token is tagged 'B' unless its label
                // is this very character. NOTE(review): presumably 'X' is not a label used in the
                // corpus - confirm.
                char lastLabel = 'X';
                for (int i = 0; i < split.length; i++) {
                    String s = split[i];
                    char c = p.getRight().charAt(i);
                    char iobLabel = c == lastLabel ? 'I' : 'B';

                    bw.write(s);
                    bw.write("\t");
                    bw.write(iobLabel + "-" + c);
                    bw.newLine();

                    lastLabel = c;
                }
                // An empty line separates consecutive mentions in the output.
                bw.newLine();
            }
        }
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy