All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.eclipse.pass.loader.journal.nih.BatchJournalFinder Maven / Gradle / Ivy

There is a newer version: 1.12.0
Show newest version
/*
 * Copyright 2017 Johns Hopkins University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.eclipse.pass.loader.journal.nih;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

import org.eclipse.pass.support.client.PassClient;
import org.eclipse.pass.support.client.PassClientSelector;
import org.eclipse.pass.support.client.model.Journal;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Analyzes the journals already in our repository so that incoming journals can be matched against them.
 *
 * @author [email protected]
 */
public class BatchJournalFinder implements JournalFinder {
    private static final Logger LOG = LoggerFactory.getLogger(BatchJournalFinder.class);

    private final Map> issnMap = new HashMap<>();
    private final Map> nlmtaMap = new HashMap<>();
    private final Map> nameMap = new HashMap<>();
    private final Set foundUris = new HashSet<>();

    private void load(PassClient client) throws IOException {
        LOG.info("Loading existing journals from PASS");

        AtomicInteger count = new AtomicInteger(0);

        PassClientSelector sel = new PassClientSelector<>(Journal.class);
        client.streamObjects(sel).forEach(j -> {

            count.incrementAndGet();

            j.getIssns().forEach(issn -> {
                update(issnMap, issn, j.getId());
            });

            update(nlmtaMap, j.getNlmta(), j.getId());
            update(nameMap, j.getJournalName(), j.getId());
        });

        LOG.info("Loaded " + count.get() + " existing journals");
    }

    private static void update(Map> map, String key, String value) {
        Set set = map.get(key);

        if (set == null) {
            set = new HashSet<>();
            map.put(key, set);
        }

        set.add(value);
    }

    /**
     * @param client the PassClient to use
     * @throws IOException on error
     */
    public BatchJournalFinder(PassClient client) throws IOException {

        LOG.info("Analyzing journals in PASS");

        load(client);

        LOG.info("Found {} existing ISSNs", issnMap.size());
        LOG.info("Found {} existing NLMTAs", nlmtaMap.size());
        LOG.info("Found {} existing NAMES", nameMap.size());
    }

    /**
     * @param nlmta the NLMTA supplied in out incoming journal data
     * @param name  the journal name
     * @param issns the list of issns
     * @return the URI string of the matching journal if found, null if nothing is found
     * or a directive to SKIP processing on this journal if the matching journal
     * has already been processed
     */
    @Override
    public synchronized String find(String nlmta, String name, List issns) {
        Set nlmtaUriSet = getIdsByNlmta(nlmta);
        Set nameUriSet = getIdsByName(name);

        Map uriScores = new HashMap<>();

        if (!issns.isEmpty()) {
            for (String issn : issns) {
                Set issnList = getIdsByIssn(issn);
                if (issnList != null) {
                    for (String uri : issnList) {
                        Integer i = uriScores.putIfAbsent(uri, 1);
                        if (i != null) {
                            uriScores.put(uri, i + 1);
                        }
                    }
                }
            }
        }

        if (nlmtaUriSet != null) {
            for (String uri : nlmtaUriSet) {
                Integer i = uriScores.putIfAbsent(uri, 1);
                if (i != null) {
                    uriScores.put(uri, i + 1);
                }
            }
        }

        if (nameUriSet != null) {
            for (String uri : nameUriSet) {
                Integer i = uriScores.putIfAbsent(uri, 1);
                if (i != null) {
                    uriScores.put(uri, i + 1);
                }
            }
        }

        if (uriScores.size() > 0) { //we have a possible uri - find out if it is matchy enough
            Integer highScore = Collections.max(uriScores.values());
            int minimumQualifyingScore = 2;
            List sortedUris = new ArrayList<>();

            for (int i = highScore; i >= minimumQualifyingScore; i--) {
                for (String uri : uriScores.keySet()) {
                    if (uriScores.get(uri) == i) {
                        sortedUris.add(uri);
                    }
                }
            }

            if (sortedUris.size() > 0) { // there are matching journals - decide if we have matched already
                String foundUri = null;
                for (String candidate : sortedUris) {
                    if (!foundUris.contains(candidate)) {
                        foundUri = candidate;
                        break;
                    }
                }
                if (foundUri != null) {
                    foundUris.add(foundUri);
                    return foundUri;
                } else { //this journal has been processed already
                    return "SKIP";
                }
            }
        }

        //nothing matches, create a new journal
        return null;
    }

    private synchronized Set getIdsByIssn(String issn) {
        if (issnMap.containsKey(issn)) {
            return issnMap.get(issn);
        }

        String[] parts = issn.split(":");

        if (parts.length == 2) {
            return issnMap.get(parts[1]);
        }

        return null;
    }

    private synchronized Set getIdsByNlmta(String nlmta) {
        if (nlmta != null && nlmta.length() > 0 && nlmtaMap.containsKey(nlmta)) {
            return nlmtaMap.get(nlmta);
        }

        return null;
    }

    private synchronized Set getIdsByName(String name) {
        if (name != null && name.length() > 0 && nameMap.containsKey(name)) {
            return nameMap.get(name);
        }

        return null;
    }

    @Override
    public synchronized void add(Journal j) {

        String uri = j.getId().toString();

        String nlmta = j.getNlmta();
        if (nlmta != null && nlmta.length() > 0) {
            LOG.debug("Adding nlmta " + nlmta);
            if (!nlmtaMap.containsKey(nlmta)) {
                nlmtaMap.put(nlmta, new HashSet<>());
            }
            nlmtaMap.get(nlmta).add(uri);
        }

        for (final String issn : j.getIssns()) {
            LOG.debug("Adding issn " + issn);
            if (!issnMap.containsKey(issn)) {
                issnMap.put(issn, new HashSet<>());
            }
            issnMap.get(issn).add(uri);
        }

        String name = j.getJournalName();
        if (name != null && name.length() > 0) {
            LOG.debug("Adding name " + name);
            if (!nameMap.containsKey(j.getJournalName())) {
                nameMap.put(name, new HashSet<>());
            }
            nameMap.get(name).add(uri);
        }

        foundUris.add(uri);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy