All Downloads are FREE. Search and download functionalities are using the official Maven repository.

uk.bl.wa.solr.SolrRecord Maven / Gradle / Ivy

There is a newer version: 3.3.0
Show newest version
/**
 * 
 */
package uk.bl.wa.solr;

/*
 * #%L
 * warc-indexer
 * $Id:$
 * $HeadURL:$
 * %%
 * Copyright (C) 2013 - 2018 The webarchive-discovery project contributors
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 2 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/gpl-2.0.html>.
 * #L%
 */

import java.io.IOException;
import java.io.Serializable;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.archive.io.ArchiveRecordHeader;

import uk.bl.wa.util.Instrument;
import uk.bl.wa.util.Normalisation;

/**
 * Wrapper around a {@link SolrInputDocument} that sanitises String values
 * before they are stored: values are truncated to a per-field maximum length,
 * round-tripped through UTF-8 to replace invalid characters, and stripped of
 * control characters.
 *
 * @author Andrew Jackson
 */
public class SolrRecord implements Serializable {

    private static final long serialVersionUID = -4556484652176976472L;

    // Compiled once: avoids re-compiling the regexps on every sanitised value.
    private static final Pattern SPACE_PATTERN = Pattern.compile("\\p{Space}");
    private static final Pattern CNTRL_PATTERN = Pattern.compile("\\p{Cntrl}");

    // Charset instances may be shared between threads; the encoders and
    // decoders created from them may not, so those are created per call.
    private static final Charset UTF8 = Charset.forName("UTF-8");

    private SolrInputDocument doc = new SolrInputDocument();

    /** Maximum length applied to fields that have no entry in {@link #maxLengths}. */
    private final int defaultMax;

    /** Per-field maximum lengths, keyed by Solr field name. */
    private final HashMap<String, Integer> maxLengths; // Explicit HashMap as it is Serializable

    /**
     * Creates an empty record.
     *
     * @param defaultMaxFieldLength
     *            maximum length for fields without an explicit limit; a value
     *            of zero or less disables truncation
     * @param maxFieldLengths
     *            per-field maximum lengths, keyed by Solr field name
     */
    public SolrRecord(int defaultMaxFieldLength, HashMap<String, Integer> maxFieldLengths) {
        this.defaultMax = defaultMaxFieldLength;
        this.maxLengths = maxFieldLengths;
    }

    /**
     * @deprecated use {@link SolrRecordFactory#createRecord()} instead.
     */
    @Deprecated
    public SolrRecord() {
        this(SolrRecordFactory.DEFAULT_MAX_LENGTH, new HashMap<String, Integer>());
    }

    /**
     * Creates a record pre-populated with the ID, source file, source file
     * offset, URL and URL type derived from the given file name and header.
     *
     * @param defaultMaxFieldLength
     *            maximum length for fields without an explicit limit; a value
     *            of zero or less disables truncation
     * @param maxFieldLengths
     *            per-field maximum lengths, keyed by Solr field name
     * @param filename
     *            name of the archive file the record comes from
     * @param header
     *            header of the archive record; supplies the offset and URL
     */
    public SolrRecord(int defaultMaxFieldLength, HashMap<String, Integer> maxFieldLengths,
                      String filename, ArchiveRecordHeader header) {
        this(defaultMaxFieldLength, maxFieldLengths);
        setField(SolrFields.ID,
                "exception-at-" + filename + "@" + header.getOffset());
        setField(SolrFields.SOURCE_FILE, filename);
        setField(SolrFields.SOURCE_FILE_OFFSET, "" + header.getOffset());
        setField(SolrFields.SOLR_URL, Normalisation.sanitiseWARCHeaderValue(header.getUrl()));
        setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_UNKNOWN);
    }

    /**
     * @deprecated use {@link SolrRecordFactory#createRecord(String, ArchiveRecordHeader)} instead.
     */
    @Deprecated
    public SolrRecord(String filename, ArchiveRecordHeader header) {
        this(SolrRecordFactory.DEFAULT_MAX_LENGTH, new HashMap<String, Integer>(), filename, header);
    }

    /**
     * @return the wrapped document serialised by {@link ClientUtils#toXML}
     */
    public String toXml() {
        return ClientUtils.toXML( doc );
    }

    /**
     * Write the SolrDocument to the provided writer, sans XML-header.
     * Intended for creating batches of documents.
     *
     * @param writer
     *            destination for the XML
     * @throws IOException
     *             if writing to the writer fails
     */
    public void writeXml(Writer writer) throws IOException {
        ClientUtils.writeXML( doc, writer );
    }

    /**
     * Trims the value, repairs invalid UTF-8, turns every whitespace character
     * into a plain space and then removes the remaining control characters.
     *
     * @param value
     *            the text to clean; must not be null
     * @return the cleaned text, or the empty string if the UTF-8 round trip
     *         fails
     */
    private String removeControlCharacters( String value ) {
        final long start = System.nanoTime();
        try {
            // Whitespace is mapped to ' ' first: tabs and newlines are also
            // control characters, and deleting them outright would glue the
            // neighbouring words together.
            String spaced = SPACE_PATTERN.matcher(sanitiseUTF8(value.trim())).replaceAll(" ");
            return CNTRL_PATTERN.matcher(spaced).replaceAll("");
        } catch (CharacterCodingException e) {
            // Not expected while the coders are configured to REPLACE bad
            // input; an empty value makes the callers skip the field.
            return "";
        } finally {
            Instrument.timeRel("WARCIndexerCommand.parseWarcFiles#solrdocCreation",
                               "SolrRecord.removeControlCharacters#total", start);
        }
    }

    /**
     * Aim to prevent "Invalid UTF-8 character 0xfffe" slipping into the text
     * payload.
     *
     * This encodes and decodes a String that may not be UTF-8 compliant as
     * UTF-8. Any dodgy characters are replaced.
     *
     * @param value
     *            the text to round-trip
     * @return the decoded characters; returned as a CharSequence so the
     *         regexp matcher can consume the buffer without a copy
     * @throws CharacterCodingException
     *             if encoding or decoding fails
     */
    private CharSequence sanitiseUTF8(String value) throws CharacterCodingException {
        final long start = System.nanoTime();
        try  {
            // Take a string, map it to bytes as UTF-8:
            CharsetEncoder encoder = UTF8.newEncoder();
            encoder.onMalformedInput(CodingErrorAction.REPLACE);
            encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
            ByteBuffer bytes = encoder.encode(CharBuffer.wrap(value));
            // Now decode back again:
            CharsetDecoder decoder = UTF8.newDecoder();
            decoder.onMalformedInput(CodingErrorAction.REPLACE);
            decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
            return decoder.decode(bytes);
        } finally {
            Instrument.timeRel("SolrRecord.removeControlCharacters#total", "SolrRecord.sanitiseUTF8", start);
        }
    }

    /**
     * Cleans the value and shortens it, to avoid bad data filling 'small'
     * fields with 'big' data. Truncation happens before cleaning, so the
     * result may be shorter than the limit.
     *
     * @param value
     *            the text to clean; may be null
     * @param truncateToLength
     *            maximum number of chars to keep (zero or less means don't
     *            truncate)
     * @return the cleaned text, or null if the value was null
     */
    private String sanitizeString(String value, int truncateToLength) {
        if (value == null) {
            return null;
        }
        if (truncateToLength > 0 && value.length() > truncateToLength) {
            int end = truncateToLength;
            // Never cut a surrogate pair in half: the orphaned high surrogate
            // would be turned into a replacement character by the UTF-8
            // round trip.
            if (Character.isHighSurrogate(value.charAt(end - 1))
                    && Character.isLowSurrogate(value.charAt(end))) {
                end--;
            }
            value = value.substring(0, end);
        }
        return removeControlCharacters(value);
    }

    /**
     * Looks up the maximum length configured for a field.
     *
     * @param solrField
     *            the Solr field name
     * @return the field-specific limit, or the default limit if none is set
     */
    private int getMaxLength(String solrField) {
        Integer max = maxLengths.get(solrField);
        return max == null ? defaultMax : max;
    }

    /**
     * Add any non-null string properties, stripping control characters if
     * present. The value is truncated to the maximum length configured for
     * the field.
     *
     * @param solr_property
     *            the Solr field name
     * @param value
     *            the value to add; null and empty values are ignored
     */
    public void addField(String solr_property, String value) {
        addFieldTruncated(solr_property, value, getMaxLength(solr_property));
    }

    /**
     * Add the field, truncating the value if it's larger than the given limit.
     *
     * @param solr_property
     *            the Solr field name
     * @param value
     *            the value to add; ignored if null or empty after cleaning
     * @param truncateTo
     *            maximum number of chars to keep (zero or less means don't
     *            truncate)
     */
    public void addFieldTruncated(String solr_property, String value, int truncateTo) {
        value = sanitizeString(value, truncateTo);
        if (value != null && !value.isEmpty()) {
            doc.addField( solr_property, value );
        }
    }

    /**
     * Set instead of adding fields. The value is truncated to the maximum
     * length configured for the field.
     *
     * @param solr_property
     *            the Solr field name
     * @param value
     *            the value to set; null and empty values are ignored
     */
    public void setField(String solr_property, String value) {
        setFieldTruncated(solr_property, value, getMaxLength(solr_property));
    }

    /**
     * Set the field, truncating the value if it's larger than the given limit.
     *
     * @param solr_property
     *            the Solr field name
     * @param value
     *            the value to set; ignored if null or empty after cleaning
     * @param truncateTo
     *            maximum number of chars to keep (zero or less means don't
     *            truncate)
     */
    public void setFieldTruncated(String solr_property, String value, int truncateTo) {
        value = sanitizeString(value, truncateTo);
        if (value != null && !value.isEmpty()) {
            doc.setField(solr_property, value);
        }
    }

    /**
     * Like add, but also allows these values to merge with those in the index
     * already. The value is wrapped in an {@code "add"} operation map and is
     * stored as given: it is neither cleaned nor truncated.
     *
     * @param solr_property
     *            the Solr field name
     * @param value
     *            the value to merge; null and empty values are ignored
     */
    public void mergeField( String solr_property, String value ) {
        if (value == null || value.isEmpty()) {
            return;
        }
        Map<String, String> operation = new HashMap<String, String>();
        operation.put("add", value );
        doc.addField(solr_property, operation);
    }

    /**
     * @param fieldname
     *            the Solr field name
     * @return whatever the wrapped document returns as the value of the field
     */
    public Object getFieldValue(String fieldname) {
        return doc.getFieldValue(fieldname);
    }

    /**
     * @return the wrapped document itself, not a copy
     */
    public SolrInputDocument getSolrDocument() {
        return doc;
    }

    /**
     * Removes the field from the wrapped document.
     *
     * @param fieldname
     *            the Solr field name
     */
    public void removeField(String fieldname) {
        doc.removeField(fieldname);
    }

    /**
     * @param fieldname
     *            the Solr field name
     * @return the field as held by the wrapped document
     */
    public SolrInputField getField(String fieldname) {
        return doc.getField(fieldname);
    }

    /**
     * @param fieldname
     *            the Solr field name
     * @return the result of the wrapped document's {@code containsKey}
     */
    public boolean containsKey(String fieldname) {
        return doc.containsKey(fieldname);
    }

    /**
     * Replaces the wrapped document.
     *
     * @param newdoc
     *            the document to wrap from now on
     */
    public void setSolrDocument(SolrInputDocument newdoc) {
        doc = newdoc;
    }

    /**
     * Records the exception class name and message in the parse-error field.
     *
     * @param e
     *            the exception to record
     */
    public void addParseException(Throwable e) {
        addField(SolrFields.PARSE_ERROR,
                e.getClass().getName() + ": " + e.getMessage());
    }

    /**
     * Records the exception class name, a hint and the exception message in
     * the parse-error field.
     *
     * @param hint
     *            extra context placed between the class name and the message
     * @param e
     *            the exception to record
     */
    public void addParseException(String hint, Throwable e) {
        addField(SolrFields.PARSE_ERROR, e.getClass().getName() + " " + hint
                + ": " + e.getMessage());
    }

    /**
     * @return the first value of the URL field, or null if the field is not
     *         set
     */
    public String getUrl() {
        return getFirstString(SolrFields.SOLR_URL);
    }

    /**
     * @return the first value of the wayback-date field, or null if the field
     *         is not set
     */
    public String getWaybackDate() {
        return getFirstString(SolrFields.WAYBACK_DATE);
    }

    /**
     * @return the first value of the hash field, or null if the field is not
     *         set
     */
    public String getHash() {
        return getFirstString(SolrFields.HASH);
    }

    /**
     * Fetches the first value of a field as a String.
     *
     * @param fieldname
     *            the Solr field name
     * @return the first value, or null if the document has no such field
     */
    private String getFirstString(String fieldname) {
        SolrInputField field = getField(fieldname);
        return field == null ? null : (String) field.getFirstValue();
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy