All Downloads are FREE. Search and download functionalities are using the official Maven repository.

uk.bl.wa.analyser.payload.ARCNameAnalyser Maven / Gradle / Ivy

There is a newer version: 3.3.0
Show newest version
package uk.bl.wa.analyser.payload;

/*
 * #%L
 * warc-indexer
 * %%
 * Copyright (C) 2013 - 2018 The webarchive-discovery project contributors
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 2 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public
 * License along with this program.  If not, see
 * .
 * #L%
 */

import com.typesafe.config.Config;
import com.typesafe.config.ConfigValue;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.io.ArchiveRecordHeader;
import uk.bl.wa.solr.SolrRecord;
import uk.bl.wa.util.Instrument;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Matches the ARC path for configured patterns and adds extracted parts to the Solr document.
 * 

* Sample use case: Adding batch-run id encoded in the ARC filename.
* Sample rule: pattern : ".*(job[0-9]+)--([0-9]{4})([0-9]{2})([0-9]{2})-([0-9]{2})([0-9]{2})([0-9]{2}).warc" * templates: harvest_job $1 * harvest_year $2 * adds the fields {@code }harvest_job:job87} and {@code harvest_year:2015} if the ARC file is named * {@code whatever/localrun-job87-20150219-133227.warc}. * @author Toke Eskildsen * */ public class ARCNameAnalyser extends AbstractPayloadAnalyser { private static Log log = LogFactory.getLog( ARCNameAnalyser.class ); private final List rules = new ArrayList(); /* "arcname" : { # Order is significant. Processing stops after first match "rules" : [ { "pattern" :"([0-9]+)-([0-9]+)-([0-9]{4})([0-9]{2})([0-9]{2})([0-9]{2})([0-9]{2})([0-9]{2})-([0-9]+)-(sb-prod-har)-([0-9]{1,3}).(statsbiblioteket.dk.warc|statsbiblioteket.dk.arc|arc)", templates : { "arc_type" : "sb", "arc_harvesttime" : "$3-$4-$5T$6:$7:$8.000Z" } } ] } */ public ARCNameAnalyser(Config conf) { if (!conf.hasPath("warc.index.extract.content.arcname.rules")) { log.debug("No rules for ARCNameAnalyzer; no processing of ARC names"); return; } for (Config ruleConf: conf.getConfigList("warc.index.extract.content.arcname.rules")) { /* { "pattern" :"([0-9]+)-([0-9]+)-([0-9]{4})([0-9]{2})([0-9]{2})([0-9]{2})([0-9]{2})([0-9]{2})-([0-9]+)-(sb-prod-har)-([0-9]{1,3}).(statsbiblioteket.dk.warc|statsbiblioteket.dk.arc|arc)", templates : { "arc_type" : "sb", "arc_harvesttime" : "$3-$4-$5T$6:$7:$8.000Z" } } */ Pattern pattern = Pattern.compile(ruleConf.getString("pattern")); List fieldTemplates = new ArrayList(); for (Map.Entry entry: ruleConf.getConfig("templates").entrySet()) { // "arc_type" : "sb", fieldTemplates.add(new FieldTemplate(entry.getKey(), entry.getValue().unwrapped().toString())); } rules.add(new Rule(pattern, fieldTemplates)); } log.info("Added " + rules.size() + " ARCName rules"); } @Override public void analyse(ArchiveRecordHeader header, InputStream tikainput, SolrRecord solr) { final String name = header.getReaderIdentifier(); if (name == null || name.isEmpty()) { log.debug("No name present for ARC, skipping analyse"); return; } for (Rule rule: rules) { // Match against all rules if (rule.apply(name, solr)) { break; // Only one rule match } } } public List getRules() { return rules; } public static class Rule { public final Pattern pattern; public final List templates; public Rule(Pattern pattern, List templates) { this.pattern = pattern; this.templates = templates; } /** * Apply the rule to the given name. If it matches, add the content from the templates to solr. * @param name ARC path * @param solr destination for template values. * @return true if the rule was applies, else false. */ public boolean apply(String name, SolrRecord solr) { Matcher matcher = pattern.matcher(name); if (!matcher.matches()) { return false; } // Got a match. Apply all templates for (FieldTemplate ft: templates) { try { solr.addField(ft.field, matcher.replaceAll(ft.template)); } catch (Exception e) { log.warn(String.format( "Unable to apply replaceAll to '%s' with matching pattern '%s' and template '%s:%s': %s", name, pattern.pattern(), ft.field, ft.template, e.getMessage())); } } return true; } } public static class FieldTemplate { public final String field; public final String template; public FieldTemplate(String field, String template) { this.field = field; this.template = template; if (field == null || field.isEmpty()) { throw new IllegalArgumentException("Field must not be empty"); } if (template == null || template.isEmpty()) { throw new IllegalArgumentException("Template must not be empty"); } } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy