
// de.gwdg.metadataqa.marc.analysis.ThompsonTraillAnalysis Maven / Gradle / Ivy
package de.gwdg.metadataqa.marc.analysis;
import de.gwdg.metadataqa.marc.dao.Control008;
import de.gwdg.metadataqa.marc.dao.DataField;
import de.gwdg.metadataqa.marc.dao.record.BibliographicRecord;
import de.gwdg.metadataqa.marc.MarcSubfield;
import de.gwdg.metadataqa.marc.definition.bibliographic.SchemaType;
import de.gwdg.metadataqa.marc.definition.general.codelist.CountryCodes;
import de.gwdg.metadataqa.marc.definition.general.codelist.LanguageCodes;
import java.util.*;
import java.util.logging.Logger;
import java.util.regex.Pattern;
/**
* Implementation of the scoring algorithm described in
* Leveraging Python to improve ebook metadata selection, ingest, and management
* by Kelly Thompson and Stacie Traill
* Code4Lib Journal, Issue 38, 2017-10-18
* http://journal.code4lib.org/articles/12828
*/
public class ThompsonTraillAnalysis {
private static final Logger logger = Logger.getLogger(ThompsonTraillAnalysis.class.getCanonicalName());
/**
 * Shape of a plausible four-digit year: 1450-1999 or 2000-2099.
 * The upper range is intentionally not pinned to a particular year; a fixed
 * cut-off silently stops scoring every record dated after it.
 */
private static final Pattern datePattern = Pattern.compile(
  "^(14[5-9]\\d|1[5-9]\\d\\d|20\\d\\d)$"
);
/**
 * Header cells of the score table: the {@code getMachine()} value of every
 * {@link ThompsonTraillFields} constant, in the order returned by {@code values()}.
 */
private static final List<String> headers = new ArrayList<>();
static {
  for (ThompsonTraillFields field : ThompsonTraillFields.values()) {
    headers.add(field.getMachine());
  }
}
/**
 * Private so the class cannot be instantiated from outside; every member is static.
 *
 * @throws IllegalStateException always
 */
private ThompsonTraillAnalysis() {
throw new IllegalStateException("This is a utility class");
}
/**
 * Returns the header cells of the score table.
 *
 * @return a fresh list holding the header values; changing it has no effect
 *         on the headers returned by later calls
 */
public static List<String> getHeader() {
  // Hand out a copy so callers cannot alter the shared static list.
  return new ArrayList<>(headers);
}
public static List getScores(BibliographicRecord marcRecord) {
var ttScores = new ThompsonTraillScores();
if (marcRecord.getSchemaType().equals(SchemaType.MARC21)) {
// countFields
ttScores.set(ThompsonTraillFields.ISBN, countFields(marcRecord, Arrays.asList("020")));
ttScores.set(ThompsonTraillFields.AUTHORS, countFields(marcRecord, Arrays.asList("100", "110", "111")));
ttScores.set(ThompsonTraillFields.ALTERNATIVE_TITLES, countFields(marcRecord, Arrays.asList("246")));
ttScores.set(ThompsonTraillFields.EDITION, countFields(marcRecord, Arrays.asList("250")));
ttScores.set(ThompsonTraillFields.CONTRIBUTORS,
countFields(marcRecord, Arrays.asList("700", "710", "711", "720")));
ttScores.set(ThompsonTraillFields.SERIES,
countFields(marcRecord, Arrays.asList("440", "490", "800", "810", "830")));
// calculateTocAndAbstract
ttScores.set(ThompsonTraillFields.TOC, calculateTocAndAbstract(marcRecord));
var control008 = marcRecord.getControl008();
String date008 = extractDate008(control008);
ttScores.set(ThompsonTraillFields.DATE_008, calculateDate008(date008));
ttScores.set(ThompsonTraillFields.DATE_26X, calculateDate26x(marcRecord, date008));
ttScores.set(ThompsonTraillFields.LC_NLM, calculateClassificationLcNlm(marcRecord));
calculateClassifications(marcRecord, ttScores);
// calculateIsOnlineResource
ttScores.set(ThompsonTraillFields.ONLINE, calculateIsOnlineResource(marcRecord, control008));
ttScores.set(ThompsonTraillFields.LANGUAGE_OF_RESOURCE, calculateLanguageOfResource(control008));
ttScores.set(ThompsonTraillFields.COUNTRY_OF_PUBLICATION, calculateCountryOfPublication(control008));
calculateLanguageAndRda(marcRecord, ttScores);
} else if (marcRecord.getSchemaType().equals(SchemaType.PICA)) {
for (Map.Entry> entry : marcRecord.getThompsonTraillTagsMap().entrySet())
ttScores.set(entry.getKey(), countFields(marcRecord, entry.getValue()));
}
ttScores.calculateTotal();
return ttScores.asList();
}
/**
 * Scores the cataloguing conventions recorded in field 040.
 *
 * <ul>
 *   <li>Language of cataloguing (040$b): 1 point if no language is specified at all,
 *       or if English ("eng") is specified.</li>
 *   <li>Descriptive cataloguing standard (040$e): 1 point if a value is "rda".</li>
 * </ul>
 *
 * @param marcRecord the record to inspect
 * @param ttScores the score holder that receives both results
 */
private static void calculateLanguageAndRda(BibliographicRecord marcRecord,
                                            ThompsonTraillScores ttScores) {
  var languageSpecified = false;
  var isEnglish = false;
  var isRDA = false;

  List<DataField> fields040 = marcRecord.getDatafield("040");
  if (fields040 != null) {
    for (DataField field040 : fields040) {
      List<MarcSubfield> languages = field040.getSubfield("b");
      if (languages != null) {
        for (MarcSubfield language : languages) {
          languageSpecified = true;
          if ("eng".equals(language.getValue())) {
            isEnglish = true;
          }
        }
      }
      List<MarcSubfield> conventions = field040.getSubfield("e");
      if (conventions != null) {
        for (MarcSubfield convention : conventions) {
          if ("rda".equals(convention.getValue())) {
            isRDA = true;
          }
        }
      }
    }
  }

  // A record that names no cataloguing language earns the point as well;
  // only an explicit non-English language loses it.
  var noLanguageOrEnglish = !languageSpecified || isEnglish;
  ttScores.set(ThompsonTraillFields.LANGUAGE_OF_CATALOGING, (noLanguageOrEnglish ? 1 : 0));
  ttScores.set(ThompsonTraillFields.RDA, (isRDA ? 1 : 0));
}
/**
 * LC/NLM classification: 1 point if any of the fields 050, 060 or 090 exists.
 *
 * @param marcRecord the record to inspect
 * @return 1 if at least one of the three fields is present, otherwise 0
 */
private static int calculateClassificationLcNlm(BibliographicRecord marcRecord) {
  for (String tag : Arrays.asList("050", "060", "090")) {
    if (exists(marcRecord, tag)) {
      return 1;
    }
  }
  return 0;
}
/**
 * Date (MARC 26X), taken from 260$c or 264$c.
 *
 * <p>1 point if a $c subfield exists in either field; 2 points if, in addition,
 * the value of such a subfield contains the date coded in 008.
 *
 * <p>NOTE(review): the scoring rule speaks of a "4-digit date", but any $c
 * subfield earns the first point here regardless of its content - confirm
 * whether that is intended.
 *
 * @param marcRecord the record to inspect
 * @param date008 the date extracted from 008; an empty or null value never matches
 * @return 0, 1 or 2
 */
private static int calculateDate26x(BibliographicRecord marcRecord, String date008) {
  var score = 0;
  for (String tag : Arrays.asList("260", "264")) {
    score = Math.max(score, scoreDateSubfields(marcRecord.getDatafield(tag), date008));
    if (score == 2) {
      break; // maximum reached, the other field cannot change the result
    }
  }
  return score;
}

/** Scores the $c subfields of one set of 26X fields: 0 = none, 1 = present, 2 = contains the 008 date. */
private static int scoreDateSubfields(List<DataField> fields, String date008) {
  if (fields == null) {
    return 0;
  }
  var comparable = date008 != null && !date008.isEmpty();
  var score = 0;
  for (DataField field : fields) {
    List<MarcSubfield> subfields = field.getSubfield("c");
    if (subfields == null) {
      continue;
    }
    for (MarcSubfield subfield : subfields) {
      score = 1;
      String value = subfield.getValue();
      if (comparable && value != null && value.contains(date008)) {
        return 2;
      }
    }
  }
  return score;
}
/**
 * Reads the date stored at 008/07-10.
 *
 * @param control008 the 008 control field, may be null
 * @return the coded date, or an empty string when the control field, the
 *         position or its value is missing; never null
 */
private static String extractDate008(Control008 control008) {
  if (control008 == null) {
    return "";
  }
  var position = control008.getTag008all07();
  if (position == null) {
    return "";
  }
  // Callers run a regex and string comparisons on the result, so null must not leak out.
  String value = position.getValue();
  return value == null ? "" : value;
}
/**
 * Date (MARC 008), positions 07-10: 1 point if a valid coded date exists.
 *
 * @param date008 the date extracted from 008
 * @return 1 if the whole value matches the class's date pattern, otherwise 0
 */
private static int calculateDate008(String date008) {
  if (datePattern.matcher(date008).matches()) {
    return 1;
  }
  return 0;
}
/**
 * Table of contents (505) and abstract (520): 2 points if both fields exist,
 * 1 point if only one of them does.
 *
 * @param marcRecord the record to inspect
 * @return 0, 1 or 2
 */
private static int calculateTocAndAbstract(BibliographicRecord marcRecord) {
  int tocPoint = exists(marcRecord, "505") ? 1 : 0;
  int abstractPoint = exists(marcRecord, "520") ? 1 : 0;
  return tocPoint + abstractPoint;
}
/**
 * Counts subject heading fields by vocabulary.
 *
 * <p>Fields inspected: 600 (personal name), 610 (corporate name), 611 (meeting name),
 * 630 (uniform title), 650 (topical term), 651 (geographic name) and
 * 653 (uncontrolled index term). Each field is counted once, chosen by its
 * second indicator:
 * <ul>
 *   <li>0 - Library of Congress subject headings</li>
 *   <li>2 - MeSH</li>
 *   <li>7 - vocabulary named in $2: "fast", "gnd", anything else counts as other;
 *       a field without $2 is logged and not counted</li>
 *   <li>any other value - other</li>
 * </ul>
 * The scoring rule caps these counts (10 points, 5 for "other"); no cap is applied
 * in this method.
 *
 * @param marcRecord the record to inspect
 * @param ttScores the score holder whose counters are incremented
 */
private static void calculateClassifications(BibliographicRecord marcRecord,
                                             ThompsonTraillScores ttScores) {
  for (String tag : Arrays.asList("600", "610", "611", "630", "650", "651", "653")) {
    List<DataField> fields = marcRecord.getDatafield(tag);
    if (fields == null) {
      continue;
    }
    for (DataField field : fields) {
      var ind2 = field.getInd2();
      if ("0".equals(ind2)) {
        // NOTE(review): this increments the same LC_NLM score that the caller also sets
        // from the 050/060/090 check - presumably a dedicated LCSH counter was intended; confirm.
        ttScores.count(ThompsonTraillFields.LC_NLM);
      } else if ("2".equals(ind2)) {
        ttScores.count(ThompsonTraillFields.MESH);
      } else if ("7".equals(ind2)) {
        countBySource(marcRecord, field, ttScores);
      } else {
        ttScores.count(ThompsonTraillFields.OTHER);
      }
    }
  }
}

/** Counts a subject field with second indicator 7 under the vocabulary named in its first $2. */
private static void countBySource(BibliographicRecord marcRecord,
                                  DataField field,
                                  ThompsonTraillScores ttScores) {
  List<MarcSubfield> sources = field.getSubfield("2");
  if (sources == null || sources.isEmpty()) {
    // Reporting the problem must not fail itself when the record has no 001.
    var control001 = marcRecord.getControl001();
    Object recordId = (control001 == null) ? "unknown record" : control001.getContent();
    logger.severe(String.format("Error in %s: ind2 = 7, but there is no $2", recordId));
    return;
  }
  String source = sources.get(0).getValue();
  if ("fast".equals(source)) {
    ttScores.count(ThompsonTraillFields.FAST);
  } else if ("gnd".equals(source)) {
    ttScores.count(ThompsonTraillFields.GND);
  } else {
    ttScores.count(ThompsonTraillFields.OTHER);
  }
}
/**
 * Online resource score: the sum of the 008 based and the 300$a based indicator.
 *
 * @param marcRecord the record to inspect
 * @param control008 the record's 008 control field, may be null
 * @return 0, 1 or 2
 */
private static int calculateIsOnlineResource(BibliographicRecord marcRecord, Control008 control008) {
  return calculateIsOnlineFrom008(marcRecord, control008)
    + calculateIsOnlineFrom300a(marcRecord);
}
/**
 * Checks whether a 300$a subfield describes the item as an online resource.
 *
 * <p>The phrase is searched for inside the value, ignoring case, because the
 * extent statement normally carries more than the bare phrase
 * (for example "1 online resource (xii, 300 pages)").
 *
 * @param marcRecord the record to inspect
 * @return 1 if any 300$a contains "online resource", otherwise 0
 */
private static int calculateIsOnlineFrom300a(BibliographicRecord marcRecord) {
  List<DataField> fields300 = marcRecord.getDatafield("300");
  if (fields300 == null) {
    return 0;
  }
  for (DataField field : fields300) {
    List<MarcSubfield> extents = field.getSubfield("a");
    if (extents == null) {
      continue;
    }
    for (MarcSubfield extent : extents) {
      String value = extent.getValue();
      if (value != null && value.toLowerCase(Locale.ROOT).contains("online resource")) {
        return 1;
      }
    }
  }
  return 0;
}
/**
 * Checks whether the form-of-item code in 008 is "o".
 *
 * <p>Which 008 position is read depends on the record type: position 29 for maps
 * and visual materials, position 23 for the other handled types. Record types
 * without a case below yield 0.
 *
 * @param marcRecord the record, used for its type
 * @param control008 the record's 008 control field, may be null
 * @return 1 if the form of item is "o", otherwise 0
 */
private static int calculateIsOnlineFrom008(BibliographicRecord marcRecord, Control008 control008) {
  if (control008 == null) {
    return 0;
  }
  String formOfItem = null;
  switch (marcRecord.getType()) {
    case BOOKS:
      formOfItem = control008.getTag008book23() == null
        ? null : control008.getTag008book23().getValue();
      break;
    case COMPUTER_FILES:
      formOfItem = control008.getTag008computer23() == null
        ? null : control008.getTag008computer23().getValue();
      break;
    case CONTINUING_RESOURCES:
      formOfItem = control008.getTag008continuing23() == null
        ? null : control008.getTag008continuing23().getValue();
      break;
    case MAPS:
      formOfItem = control008.getTag008map29() == null
        ? null : control008.getTag008map29().getValue();
      break;
    case MIXED_MATERIALS:
      formOfItem = control008.getTag008mixed23() == null
        ? null : control008.getTag008mixed23().getValue();
      break;
    case MUSIC:
      formOfItem = control008.getTag008music23() == null
        ? null : control008.getTag008music23().getValue();
      break;
    case VISUAL_MATERIALS:
      formOfItem = control008.getTag008visual29() == null
        ? null : control008.getTag008visual29().getValue();
      break;
    default:
      // no form-of-item position is read for other record types
      break;
  }
  return "o".equals(formOfItem) ? 1 : 0;
}
/**
 * Language of resource (008/35-37): 1 point if a likely language code exists.
 *
 * @param control008 the record's 008 control field, may be null
 * @return 1 if the value at 008/35 is accepted by {@code LanguageCodes}, otherwise 0
 */
private static int calculateLanguageOfResource(Control008 control008) {
  if (control008 == null || control008.getTag008all35() == null) {
    return 0;
  }
  return LanguageCodes.getInstance().isValid(control008.getTag008all35().getValue()) ? 1 : 0;
}
/**
 * Country of publication code (008/15-17): 1 point if a likely country code exists.
 *
 * @param control008 the record's 008 control field, may be null
 * @return 1 if the value at 008/15 is accepted by {@code CountryCodes}, otherwise 0
 */
private static int calculateCountryOfPublication(Control008 control008) {
  if (control008 == null || control008.getTag008all15() == null) {
    return 0;
  }
  return CountryCodes.getInstance().isValid(control008.getTag008all15().getValue()) ? 1 : 0;
}
/**
 * Sums a list of scores.
 *
 * @param scores the individual scores; must not contain null
 * @return the sum, 0 for an empty list
 */
private static Integer getTotal(List<Integer> scores) {
  var total = 0;
  for (int score : scores) {
    total += score;
  }
  return total;
}
/**
 * Tells whether the record has at least one data field with the given tag.
 *
 * @param marcRecord the record to inspect
 * @param tag the field tag to look for
 * @return true if the lookup yields a non-null, non-empty list
 */
private static boolean exists(BibliographicRecord marcRecord, String tag) {
  List<?> found = marcRecord.getDatafield(tag);
  if (found == null) {
    return false;
  }
  return !found.isEmpty();
}
/**
 * Counts the data field instances that carry any of the given tags.
 *
 * @param marcRecord the record to inspect
 * @param tags the field tags to count
 * @return the total number of matching field instances, 0 if there are none
 */
private static int countFields(BibliographicRecord marcRecord, List<String> tags) {
  var counter = 0;
  for (String tag : tags) {
    // One lookup per tag; a missing tag contributes nothing.
    List<?> fields = marcRecord.getDatafield(tag);
    if (fields != null) {
      counter += fields.size();
    }
  }
  return counter;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy