de.julielab.genemapper.resources.uima.WikipediaIndexWriter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gene-mapper-resources Show documentation
This project assembles code and files required to build the dictionaries and indexes used by the JCoRe Gene Mapper.
The newest version!
package de.julielab.genemapper.resources.uima;

import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.genemapper.WikipediaCategoryManager;
import de.julielab.genemapper.index.WikipediaIndexFields;
import de.julielab.java.utilities.FileUtilities;
import de.julielab.jcore.ae.genemapper.desc.WikipediaFamilyParsing.Entity;
import de.julielab.jcore.ae.genemapper.desc.WikipediaFamilyParsing.EntityChunk;
import de.julielab.jcore.ae.genemapper.desc.WikipediaFamilyParsing.UnspecTitle;
import de.julielab.jcore.types.EntityMention;
import de.julielab.jcore.types.Header;
import de.julielab.jcore.types.wikipedia.Title;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ResourceMetaData;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;

@ResourceMetaData(name = "JCoRe GeneMapper Wikipedia Index Writer", description = "Expects CASes read by the GeneMapper Wikipedia Reader and processed by the entity class annotation RUTA analysis engine.. Creates an index of the read page excerpts and adds the entity class of the page as extracted by the RUTA component. This is supposed to help in the classification of gene/protein families and groups.")
public class WikipediaIndexWriter extends JCasAnnotator_ImplBase {
    public static final String PARAM_INDEX_DIRECTORY = "IndexDirectory";
    public static final String PARAM_REDIRECT_MAP = "RedirectMap";
    public static final String PARAM_WIKIPEDIA_CATEGORY_TREE_PATH = "WikipediaCategoryTreePath";

    private final static Logger log = LoggerFactory.getLogger(WikipediaIndexWriter.class);
    private static IndexWriter iw;
    private static Map> redirectMap = Collections.emptyMap();
    private static WikipediaCategoryManager wikipediaCategoryManager;
    @ConfigurationParameter(name = PARAM_INDEX_DIRECTORY, description = "The path for the index to be created. An already existing index will be overwritten.")
    private String indexDirectoryPath;
    @ConfigurationParameter(name = PARAM_REDIRECT_MAP, description = "Optional. File that maps page titles to the titles of pages redirecting to it. If given, those redirect titles are added to the 'title' field of the respective document.")
    private String redirectMapPath;
    @ConfigurationParameter(name = PARAM_WIKIPEDIA_CATEGORY_TREE_PATH, mandatory = false, description = "Optional. File created by GeNo's 'WikipediaCategoryTreeAndRedirectsExtractor' class that represents a map from page and category titles to categories they belong to. Will be used to filter for pages that are in some way related to the Molecular Biology category. Will also add the category path from Molecular Biology to the indexed page to the index.")
    private String wikipediaCategoryTreePath;
    private final TermNormalizer termNormalizer = new TermNormalizer();
    private final Set prohibitedMolecularBiologyPathElements = Set.of("Category:Water", "Category:Human geography", "Category:People", "Category:Bodies of water", "Category:Reasoning", "Category:Cognition", "Category:Cars", "Category:Aggression", "Category:Reproduction", "Category:Genealogy", "Category:Artificial intelligence", "Category:Taxa", "Category:Anatomy", "Category:Neuroscience", "Category:Human names", "Category:Botany", "Category:Philosophy of biology");

    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        indexDirectoryPath = (String) context.getConfigParameterValue(PARAM_INDEX_DIRECTORY);
        redirectMapPath = (String) context.getConfigParameterValue(PARAM_REDIRECT_MAP);
        wikipediaCategoryTreePath = (String) context.getConfigParameterValue(PARAM_WIKIPEDIA_CATEGORY_TREE_PATH);
        synchronized (WikipediaIndexWriter.class) {
            try {
                Path indexPath = Path.of(indexDirectoryPath);
                File indexFile = indexPath.toFile();
                if (!indexFile.exists()) {
                    log.info("Creating index directory {}.", indexPath);
                    indexFile.mkdirs();
                }
                if (iw == null) {
                    IndexWriterConfig iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
                    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
                    FSDirectory indexDirectory = FSDirectory.open(indexPath);
                    iw = new IndexWriter(indexDirectory, iwc);
                }
            } catch (IOException e) {
                log.error("IOException while initializing the index directory.", e);
                throw new ResourceInitializationException(e);
            }

            if (redirectMap == null) {
                try {
                    redirectMap = readRedirectMap(redirectMapPath);
                } catch (IOException e) {
                    log.error("IOException while reading the Wikipedia redirect map.", e);
                    throw new ResourceInitializationException(e);
                }
            }

            if (wikipediaCategoryTreePath != null && wikipediaCategoryManager == null) {
                String root = "Category:Biology";
                log.info("Creating Dijkstra tree for {}. Prohibited path elements: {}", root, prohibitedMolecularBiologyPathElements);
                wikipediaCategoryManager = new WikipediaCategoryManager(wikipediaCategoryTreePath, true);
                wikipediaCategoryManager.buildDijkstraTree(root);
            }
        }
    }

    private Map> readRedirectMap(String redirectMapPath) throws IOException {
        try (BufferedReader br = FileUtilities.getReaderFromFile(new File(redirectMapPath))) {
            // skip the header
            return br.lines().skip(1).map(line -> line.split("\\t")).collect(Collectors.toMap(s -> s[0], s -> {
                List l = new ArrayList<>();
                l.add(s[1].intern());
                return l;
            }, (l1, l2) -> {
                l1.addAll(l2);
                return l1;
            }));
        }
    }

    @Override
    public void collectionProcessComplete() throws AnalysisEngineProcessException {
        super.collectionProcessComplete();
        try {
            iw.close();
        } catch (IOException e) {
            log.error("Could not close index writer", e);
            throw new AnalysisEngineProcessException(e);
        }
    }

    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
        Document doc = createDocument(jCas);

        try {
            if (doc != null)
                iw.addDocument(doc);
        } catch (IOException e) {
            log.error("Could not index document {}", doc, e);
            throw new AnalysisEngineProcessException(e);
        }
    }

    private Document createDocument(JCas jCas) {
        Header header = JCasUtil.selectSingle(jCas, Header.class);
        Map> wikipediaTitleIndex = JCasUtil.indexCovered(jCas, EntityChunk.class, EntityMention.class);
        Optional pageTitleOpt = JCasUtil.select(jCas, Title.class).stream().findAny();
        Optional<UnspecTitle> unspecTitleOpt = JCasUtil.select(jCas, UnspecTitle.class).stream().findAny();
        Collection<EntityChunk> entityChunks = JCasUtil.select(jCas, EntityChunk.class);
        Collection<Entity> entities = JCasUtil.select(jCas, Entity.class);
        boolean titleIsInPlural = false;

        List<String> path = null;
        if (wikipediaCategoryManager != null) {
            path = wikipediaCategoryManager.getShortestPathToDijkstraTreeRoot(header.getTitle(), prohibitedMolecularBiologyPathElements);
            if (path.isEmpty()) {
                log.debug("Skipping page {} because no path to the category graph root was found.", header.getTitle());
                return null;
            }
        }

        if (pageTitleOpt.isPresent()) {
            Title pageTitle = pageTitleOpt.get();
            if ((pageTitle.getEnd() + 1 < jCas.getDocumentText().length() && jCas.getDocumentText().charAt(pageTitle.getEnd()) == 's')
                    || (pageTitle.getEnd() + 2 < jCas.getDocumentText().length() && jCas.getDocumentText().charAt(pageTitle.getEnd()) == 'e' && jCas.getDocumentText().charAt(pageTitle.getEnd() + 1) == 's'))
                titleIsInPlural = true;
        }

        Document doc = new Document();
        doc.add(new StringField(WikipediaIndexFields.PAGE_ID, header.getDocId(), Field.Store.YES));
        doc.add(new TextField(WikipediaIndexFields.TITLE, termNormalizer.normalize(header.getTitle()), Field.Store.NO));
        doc.add(new StoredField(WikipediaIndexFields.TITLE, header.getTitle()));
        for (String redirectTitle : redirectMap.getOrDefault(header.getTitle(), Collections.emptyList())) {
            doc.add(new TextField(WikipediaIndexFields.TITLE, termNormalizer.normalize(redirectTitle), Field.Store.NO));
            doc.add(new StoredField(WikipediaIndexFields.TITLE, redirectTitle));
        }
        if (unspecTitleOpt.isPresent())
            doc.add(new StringField(WikipediaIndexFields.HASUNSPECTITLE, "true", Field.Store.YES));
        if (titleIsInPlural)
            doc.add(new StringField(WikipediaIndexFields.TITLEISINPLURAL, "true", Field.Store.YES));
        for (EntityChunk chunk : entityChunks) {
            doc.add(new TextField(WikipediaIndexFields.ENTITYCHUNKS, termNormalizer.normalize(chunk.getCoveredText()), Field.Store.YES));
        }
        for (Entity entity : entities) {
            doc.add(new TextField(WikipediaIndexFields.ENTITIES, termNormalizer.normalize(entity.getCoveredText()), Field.Store.YES));
        }
        for (EntityChunk chunk : entityChunks) {
            Collection<EntityMention> wikipediaTitleInEntityChunk = wikipediaTitleIndex.get(chunk);
            for (EntityMention wikipediaTitle : wikipediaTitleInEntityChunk) {
                doc.add(new TextField(WikipediaIndexFields.MENTIONED_PAGE_TITLES, termNormalizer.normalize(wikipediaTitle.getCoveredText()), Field.Store.NO));
                doc.add(new StoredField(WikipediaIndexFields.MENTIONED_PAGE_TITLES, wikipediaTitle.getCoveredText()));
            }
        }
        if (wikipediaCategoryManager != null) {
//            System.out.println("Writing path to wikipedia page " + pageTitleOpt.orElse(new Title(jCas)).getCoveredText() + ": " + path);
            for (String pathElement : path) {
                doc.add(new TextField(WikipediaIndexFields.MOLECULAR_BIOLOGY_PATH, termNormalizer.normalize(pathElement), Field.Store.NO));
                doc.add(new StoredField(WikipediaIndexFields.MOLECULAR_BIOLOGY_PATH, pathElement));
            }
            doc.add(new IntPoint(WikipediaIndexFields.MOLECULAR_BIOLOGY_PATH_LENGTH, path.size()));
        } else {
            System.out.println("WikiCategoryManager is null!");
        }
        return doc;
    }
}
</code></pre>    <br/>
    <br/>
<div class='clear'></div>
</main>
</div>
<br/><br/>
    <div class="align-center">© 2015 - 2025 <a href="/legal-notice.php">Weber Informatics LLC</a> | <a href="/data-protection.php">Privacy Policy</a></div>
<br/><br/><br/><br/><br/><br/>
</body>
</html>