All downloads are free. The search and download functionality uses the official Maven repository.

de.gwdg.metadataqa.marc.MarcFieldExtractor Maven / Gradle / Ivy

package de.gwdg.metadataqa.marc;

import com.jayway.jsonpath.InvalidJsonException;
import de.gwdg.metadataqa.api.counter.FieldCounter;
import de.gwdg.metadataqa.api.interfaces.Calculator;
import de.gwdg.metadataqa.api.interfaces.MetricResult;
import de.gwdg.metadataqa.api.model.pathcache.JsonPathCache;
import de.gwdg.metadataqa.api.model.XmlFieldInstance;
import de.gwdg.metadataqa.api.model.pathcache.PathCache;
import de.gwdg.metadataqa.api.schema.Schema;
import de.gwdg.metadataqa.api.util.CompressionLevel;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.logging.Logger;

import de.gwdg.metadataqa.marc.dao.Control007;
import de.gwdg.metadataqa.marc.dao.Control008;
import de.gwdg.metadataqa.marc.dao.Leader;
import de.gwdg.metadataqa.marc.definition.general.codelist.CodeList;
import de.gwdg.metadataqa.marc.definition.general.codelist.LanguageCodes;
import de.gwdg.metadataqa.marc.definition.general.codelist.OrganizationCodes;
import org.apache.commons.lang3.StringUtils;

/**
 *
 * @author Péter Király 
 */
public class MarcFieldExtractor implements Calculator, Serializable {

  private static final Logger logger = Logger.getLogger(MarcFieldExtractor.class.getCanonicalName());

  public static final String CALCULATOR_NAME = "fieldExtractor";
  public static final String LEADER_KEY = "leader";
  private static final List authorFields = Arrays.asList("100$a", "110$a", "700$a", "710$a");

  public static final String FIELD_NAME = "recordId";
  private String idPath;
  protected FieldCounter> resultMap;
  protected Schema schema;
  private String recordId;
  private Leader leader;
  private Control007 x007;
  private Control008 x008;
  private Map duplumKeyMap;
  private List titleWords;
  private List authorWords;
  private String duplumKeyType;
  private List dateOfPublication;
  private List isbn;
  private String publisherOrDistributorNumber;
  private String abbreviatedNameOfPublisher;
  private String numberOfPart;
  private String nameOfPart;
  private String extent;
  private String musicalPresentationStatement;
  private String volumeDesignation;
  private String relatedParts;
  private List systemControlNumbers;
  private Map oclcMap;
  private boolean valid;

  public MarcFieldExtractor() {
  }

  public MarcFieldExtractor(Schema schema) {
    this.schema = schema;
    setIdPath(schema.getExtractableFields().get("001"));
  }

  public MarcFieldExtractor(String idPath) {
    this.idPath = idPath;
  }

  @Override
  public String getCalculatorName() {
    return CALCULATOR_NAME;
  }

  public void measure(JsonPathCache cache)
        throws InvalidJsonException {
    valid = true;
    resultMap = new FieldCounter<>();
    duplumKeyMap = null;
    recordId = null;
    leader = null;
    x007 = null;
    x008 = null;
    titleWords = null;
    authorWords = null;
    duplumKeyType = null;
    dateOfPublication = null;
    isbn = null;
    publisherOrDistributorNumber = null;
    abbreviatedNameOfPublisher = null;
    numberOfPart = null;
    nameOfPart = null;
    extent = null;
    musicalPresentationStatement = null;
    volumeDesignation = null;
    relatedParts = null;
    systemControlNumbers = null;
    oclcMap = null;

    recordId = ((List) cache.get(getIdPath())).get(0).getValue();
    cache.setRecordId(recordId);
    resultMap.put(FIELD_NAME, Arrays.asList(recordId));
    if (schema != null) {
      String path;
      for (String fieldName : schema.getExtractableFields().keySet()) {
        if (!fieldName.equals(FIELD_NAME)) {
          path = schema.getExtractableFields().get(fieldName);
          List instances = cache.get(path);
          List values = null;
          if (!isNull(instances)) {
            values = new ArrayList<>();
            for (XmlFieldInstance instance : instances) {
              values.add(instance.getValue());
            }
            if (fieldName.equals(LEADER_KEY)) {
              leader = new Leader(values.get(0));
            }
          }
          resultMap.put(fieldName, values);
        }
      }
    }
    processLeader();
    process007();
    process008();
    processType();
    processTitleWords();
    processAuthorWords();
    processDateOfPublication();
    processIsbn();
    processPublisherOrDistributorNumber();
    processAbbreviatedNameOfPublisher();
    processNumberOfPart();
    processNameOfPart();
    processExtent();
    processMusicalPresentationStatement();
    processVolumeDesignation();
    processRelatedParts();
    processSystemControlNumbers();
    processOclcFields();
    createDuplumKeyMap();
  }

  private static boolean isNull(List values) {
    return values == null
          || values.isEmpty()
          || values.get(0) == null
          || values.get(0).getValue() == null;
  }

  public String getIdPath() {
    return idPath;
  }

  public void setIdPath(String idPath) {
    this.idPath = idPath;
  }

  @Override
  public List measure(PathCache pathCache) {
    return null;
  }

  // @Override
  public Map getResultMap() {
    return resultMap.getMap();
  }

  // @Override
  public Map> getLabelledResultMap() {
    Map> labelledResultMap = new LinkedHashMap<>();
    labelledResultMap.put(getCalculatorName(), resultMap.getMap());
    return labelledResultMap;
  }

  // @Override
  public String getCsv(boolean withLabel, CompressionLevel compressionLevel) {
    return resultMap.getCsv(withLabel, CompressionLevel.ZERO); // the extracted fields should never be compressed!
  }

  @Override
  public List getHeader() {
    List headers = new ArrayList<>();
    headers.add(FIELD_NAME);
    return headers;
  }

  public void processLeader() {
    if (resultMap.has(LEADER_KEY))
      leader = new Leader(resultMap.get(LEADER_KEY).get(0));
    else
      logger.severe(String.format("No leader in result map. Nr of existing vars: %s",
          StringUtils.join(resultMap.getMap().keySet(), ", ")));
  }

  public void process007() {
    if (resultMap.get("007") == null) {
      valid = false;
    } else {
      x007 = new Control007(resultMap.get("007").get(0));
    }
  }

  public void process008() {
    if (resultMap.get("008") != null 
        && StringUtils.isNotBlank(resultMap.get("008").get(0)))
      x008 = new Control008(resultMap.get("008").get(0), leader.getType());
  }

  private void processTitleWords() {
    titleWords = extractWords(StringUtils.join(resultMap.get("245$a"), " "), 3);
  }

  private void processType() {
    String typeOfRecord = leader.getByLabel("Type of record");
    String bibliographicLevel = leader.getByLabel("Bibliographic level");
    if (typeOfRecord.equals("a") && bibliographicLevel.equals("s")) {
      duplumKeyType = "p";
    } else if (bibliographicLevel.equals("d")) {
      duplumKeyType = "s";
    } else if (bibliographicLevel.equals("a") || bibliographicLevel.equals("b")) {
      duplumKeyType = "a";
    } else {
      duplumKeyType = "m";
    }
  }

  public List extractWords(String text, int length) {
    List tokens = new ArrayList<>();
    if (StringUtils.isBlank(text))
      return tokens;

    var st = new StringTokenizer(text);
    while (st.hasMoreTokens())
      tokens.add(st.nextToken());

    var max = Math.min(length, tokens.size());
    return tokens.subList(0, max);
  }

  public String getRecordId() {
    return recordId;
  }

  public Leader getLeader() {
    return leader;
  }

  public Control007 getX007() {
    return x007;
  }

  public Control008 getX008() {
    return x008;
  }

  public List getTitleWords() {
    return titleWords;
  }

  public List getAuthorWords() {
    return authorWords;
  }

  public String getDuplumKeyType() {
    return duplumKeyType;
  }

  public List getDateOfPublication() {
    return dateOfPublication;
  }

  public List getIsbn() {
    return isbn;
  }

  public String getPublisherOrDistributorNumber() {
    return publisherOrDistributorNumber;
  }

  public String getAbbreviatedNameOfPublisher() {
    return abbreviatedNameOfPublisher;
  }

  public String getNumberOfPart() {
    return numberOfPart;
  }

  public String getNameOfPart() {
    return nameOfPart;
  }

  public String getExtent() {
    return extent;
  }

  public String getMusicalPresentationStatement() {
    return musicalPresentationStatement;
  }

  public String getVolumeDesignation() {
    return volumeDesignation;
  }

  public String getRelatedParts() {
    return relatedParts;
  }

  private void processAuthorWords() {
    String author = extractAuthor();
    authorWords = extractWords(author, 3);
  }

  private String extractAuthor() {
    String author = null;
    for (String field : authorFields) {
      Object value = resultMap.get(field);
      String stringValue;
      if (value instanceof List) {
        stringValue = StringUtils.join((List)value, " ");
      } else {
        stringValue = (String)value;
      }
      if (StringUtils.isNotBlank(stringValue)) {
        author = stringValue;
        break;
      }
    }
    return author;
  }

  private void processDateOfPublication() {
    dateOfPublication = resultMap.get("260$c");
  }

  private void processIsbn() {
    isbn = resultMap.get("020$a");
  }

  private void processPublisherOrDistributorNumber() {
    publisherOrDistributorNumber = duplumKeyType.equals("m")
        ? null : StringUtils.join(resultMap.get("028$a"), "; ");
  }

  private void processAbbreviatedNameOfPublisher() {
    abbreviatedNameOfPublisher = StringUtils.join(resultMap.get("060$b"), "; ");
  }

  private void processNumberOfPart() {
    numberOfPart = StringUtils.join(resultMap.get("245$n"), "; ");
  }

  private void processNameOfPart() {
    nameOfPart = StringUtils.join(resultMap.get("245$p"), "; ");
  }

  private void processExtent() {
    extent = StringUtils.join(resultMap.get("300$a"), "; ");
  }

  private void processMusicalPresentationStatement() {
    musicalPresentationStatement = StringUtils.join(resultMap.get("254$a"), "; ");
  }

  private void processVolumeDesignation() {
    volumeDesignation = StringUtils.join(resultMap.get("490$v"), "; ");
  }

  private void processRelatedParts() {
    relatedParts = StringUtils.join(resultMap.get("773$g"), "; ");
  }

  private void processOclcFields() {
    oclcMap = new LinkedHashMap<>();
    oclcMap.put("oclcLibraryIdentifier", resolve(resultMap.get("029$a"), OrganizationCodes.getInstance()));
    oclcMap.put("otherSystemControlNumber", resultMap.get("029$b"));
    oclcMap.put("catalogingAgency", resolve(resultMap.get("040$a"), OrganizationCodes.getInstance()));
    oclcMap.put("languageOfCataloging", resolve(resultMap.get("040$b"), LanguageCodes.getInstance()));
    oclcMap.put("transcribingAgency", resolve(resultMap.get("040$c"), OrganizationCodes.getInstance()));
    oclcMap.put("modifyingAgency", resolve(resultMap.get("040$d"), OrganizationCodes.getInstance()));
    oclcMap.put("topicalTerm", resultMap.get("650$a"));
    oclcMap.put("manifestId", resultMap.get("911$9"));
    oclcMap.put("workId", resultMap.get("912$9"));
    oclcMap.put("placeOfPublication", resultMap.get("260$a"));
    oclcMap.put("nameOfPublisher", resultMap.get("260$b"));
    oclcMap.put("sourceOfHeading", resultMap.get("650$2"));
    oclcMap.put("title", resultMap.get("245$a"));
  }

  private Object resolve(List list, CodeList codeService) {
    if (list == null || list.isEmpty())
      return list;

    List resolvedList = new ArrayList<>();
    for (String code : list)
      if (codeService.isValid(code))
        resolvedList.add(codeService.getCode(code).getLabel());
      else
        resolvedList.add(code);
    return resolvedList;
  }

  public Map getDuplumKeyMap() {
    if (duplumKeyMap == null) {
      createDuplumKeyMap();
    }
    return duplumKeyMap;
  }

  public void createDuplumKeyMap() {
    duplumKeyMap = new HashMap<>();
    duplumKeyMap.put("recordId", recordId);
    duplumKeyMap.put("titleWords", titleWords);
    duplumKeyMap.put("authorWords", authorWords);
    duplumKeyMap.put("duplumKeyType", duplumKeyType);
    duplumKeyMap.put("dateOfPublication", dateOfPublication);
    duplumKeyMap.put("isbn", isbn);
    duplumKeyMap.put("publisherOrDistributorNumber", publisherOrDistributorNumber);
    duplumKeyMap.put("abbreviatedNameOfPublisher", abbreviatedNameOfPublisher);
    duplumKeyMap.put("numberOfPart", numberOfPart);
    duplumKeyMap.put("nameOfPart", nameOfPart);
    duplumKeyMap.put("extent", extent);
    duplumKeyMap.put("musicalPresentationStatement", musicalPresentationStatement);
    duplumKeyMap.put("volumeDesignation", volumeDesignation);
    duplumKeyMap.put("relatedParts", relatedParts);
    duplumKeyMap.put("systemControlNumbers", systemControlNumbers);
    for (Map.Entry entry : oclcMap.entrySet()) {
      duplumKeyMap.put(entry.getKey(), entry.getValue());
    }
  }

  public boolean isValid() {
    return valid;
  }

  private void processSystemControlNumbers() {
    systemControlNumbers = new ArrayList<>();
    if (resultMap.get("035$a") != null) {
      for (String original : resultMap.get("035$a")) {
        systemControlNumbers.add(new X035aSystemControlNumber(original));
      }
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy