org.metafacture.biblio.pica.PicaDecoder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of metafacture-biblio Show documentation
Modules for processing bibliographic data
There is a newer version: 6.2.0
/*
 * Copyright 2016, 2019 Christoph Böhme and others
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.metafacture.biblio.pica;

import org.metafacture.commons.StringUtil;
import org.metafacture.framework.FluxCommand;
import org.metafacture.framework.MissingIdException;
import org.metafacture.framework.StreamReceiver;
import org.metafacture.framework.annotations.Description;
import org.metafacture.framework.annotations.In;
import org.metafacture.framework.annotations.Out;
import org.metafacture.framework.helpers.DefaultObjectPipe;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Parses pica+ records. The parser only parses single records. A string
 * containing multiple records must be split into individual records before
 * passing it to {@code PicaDecoder}.
 * 
 * The parser is designed to accept any string as valid input and to parse
 * pica+ in its two serialization forms:
 * as non-normalized and as normalized.
 * To achieve this, the parser behaves as follows when parsing
 * normalized pica+:
 * <ul>
 *   <li>The parser assumes that the input starts with a field name.</li>
 *
 *   <li>The field name and the first subfield are separated by a subfield
 *   marker (&#92;u001f).</li>
 *
 *   <li>Fields are separated by record markers (&#92;u001d), field
 *   markers (&#92;u001e) or field end markers (&#92;u000a).</li>
 *
 *   <li>Subfields are separated by subfield markers (&#92;u001f).</li>
 *
 *   <li>The first character of a subfield is the name of the subfield.</li>
 *
 *   <li>The parser assumes that the end of the input marks the end of the
 *   current field and the end of the record.</li>
 *
 *   <li>To handle input with multiple field and subfield separators following
 *   each other directly (for instance &#92;u000a and &#92;u001e), it is assumed
 *   that field names, subfields, subfield names or subfield values can be
 *   empty.</li>
 * </ul>
 * When parsing non-normalized pica+:
 * <ul>
 *   <li>The parser assumes that the input starts with a field name.</li>
 *
 *   <li>The field name and the first subfield are separated by a subfield
 *   marker ($).</li>
 *
 *   <li>Fields are separated by record markers (\n) or field end
 *   markers (\n).</li>
 *
 *   <li>Subfields are separated by subfield markers ($).</li>
 *
 *   <li>The first character of a subfield is the name of the subfield.</li>
 *
 *   <li>The parser assumes that the end of the input marks the end of the
 *   current field and the end of the record.</li>
 *
 *   <li>As multiple fields and subfields are not empty in non-normalized pica+
 *   they are just treated like anything else.</li>
 * </ul>
 * <p>
 * Please note that the record marker is treated as a field delimiter and not
 * as a record delimiter. Records need to be separated prior to parsing them.
 * <p>
 * As the behaviour of the parser may result in unnamed fields or subfields or
 * fields with no subfields, the {@code PicaDecoder} automatically filters empty
 * fields and subfields:
 * <ul>
 *   <li>Subfields without a name are ignored (such subfields cannot have any
 *   value because then the first character of the value would be the name of
 *   the subfield).</li>
 *
 *   <li>Subfields which only have a name but no value are always parsed.</li>
 *
 *   <li>In normalized pica+ unnamed fields are only parsed if they contain
 *   not-ignored subfields. In non-normalized pica+ unnamed fields don't exist.</li>
 *
 *   <li>Named fields containing none or only ignored subfields are only parsed
 *   if {@link #setSkipEmptyFields(boolean)} is set to false; otherwise they are
 *   ignored.</li>
 *
 *   <li>Input containing only whitespace (spaces and tabs) is completely
 *   ignored.</li>
 * </ul>
 * <p>
 * The {@code PicaDecoder} emits start-entity and end-entity
 * events for each parsed field and literal events for each parsed
 * subfield. Field names are trimmed by default (leading and trailing whitespace
 * is removed). This can be changed by setting
 * {@link #setTrimFieldNames(boolean)} to false.
 * <p>
 * The record ID emitted with the start-record event is extracted from
 * one of the following non-normalized pica+ fields:
 * <ul>
 *   <li>003@ $0</li>
 *   <li>107F $0</li>
 *   <li>203@ $0 (this field may have an optional occurrence marker)</li>
 * </ul>
 * The value of the first matching field is used as the record ID. The $0
 * subfield must be the first subfield in the field. If
 * {@link #setIgnoreMissingIdn(boolean)} is false and no matching field is
 * found in the record, a {@link MissingIdException} is thrown; otherwise the
 * record identifier is an empty string.
 * <p>
 * For example, when run on this input in its normalized serialization form:
 * <pre>
 * 003@ &#92;u001f01234&#92;u001e
 * 028A &#92;u001faAndy&#92;u001fdWarhol&#92;u001e
 * </pre>
 *
 * the {@code PicaDecoder} will produce the following sequence of events:
 * <pre>{@literal
 * start-record "1234"
 * start-entity "003@"
 * literal "0": 1234
 * end-entity
 * start-entity "028A"
 * literal "a": Andy
 * literal "d": Warhol
 * end-entity
 * end-record
 * }</pre>
 *
 * The parser assumes that the input is utf-8 encoded. The parser does not
 * support other pica encodings.
 *
 * @author Christoph Böhme
 * @author Pascal Christoph (dr0i) (add support for non-normalized pica+)
 * @author Fabian Steeg (fsteeg) (switch to enum)
 *
 */
@Description("Parses pica+ records. The parser only parses single records. " +
        "A string containing multiple records must be split into " +
        "individual records before passing it to PicaDecoder.")
@In(String.class)
@Out(StreamReceiver.class)
@FluxCommand("decode-pica")
// The supertype must be parameterised: with a raw DefaultObjectPipe the
// process(String) override and the StreamReceiver calls on getReceiver()
// below would not type-check.
public final class PicaDecoder extends DefaultObjectPipe<String, StreamReceiver> {

    /** Initial capacity (in chars) of the reusable record buffer. */
    private static final int BUFFER_SIZE = 1024 * 1024;

    /** Matcher locating the record-ID field; rebuilt whenever the serialization form changes. */
    private Matcher idFieldMatcher;
    /** Scratch builder reused across records while collecting the record ID. */
    private final StringBuilder idBuilder = new StringBuilder();
    /** Parser state shared with {@link PicaParserState}; also holds the parsing options. */
    private final PicaParserContext parserContext = new PicaParserContext();

    // Not final: StringUtil.copyToBuffer returns the buffer to use for the
    // current record (presumably a larger one if the record did not fit --
    // TODO confirm against StringUtil).
    private char[] buffer = new char[BUFFER_SIZE];
    /** Number of valid chars in {@link #buffer} for the record being processed. */
    private int recordLen;

    private boolean ignoreMissingIdn;
    private boolean isNormalized;

    /**
     * Creates an instance of {@link PicaDecoder}. Sets the input to read as
     * normalized pica+.
     */
    public PicaDecoder() {
        this(true);
    }

    /**
     * Creates an instance of {@link PicaDecoder}. Sets the input to read as
     * normalized or non-normalized pica+.
     *
     * @param normalized true if input is read as normalized pica+, otherwise false
     */
    public PicaDecoder(final boolean normalized) {
        setNormalizedSerialization(normalized);
    }

    /**
     * Controls whether the input is read as normalized or non-normalized
     * pica+. As the default "normalized" is assumed.
     * <p>
     * Calling this method also rebuilds the regular expression that is used
     * to locate the record ID field, because the marker characters differ
     * between the two serialization forms.
     *
     * @param normalized if true, the input is treated as normalized pica+ ;
     *                   if false, it's treated as non-normalized.
     */
    public void setNormalizedSerialization(final boolean normalized) {
        this.isNormalized = normalized;

        // An ID field may start at the beginning of the input, directly after
        // a field / field-end / record marker, or after any complete line.
        final String startMarkers = "(?:^|" + PicaConstants.FIELD_MARKER.get(isNormalized) + "|" +
                PicaConstants.FIELD_END_MARKER.get(isNormalized) + "|" +
                PicaConstants.RECORD_MARKER.get(isNormalized) + "|.*\n" + ")";
        // Field name, one mandatory and one optional space, then the subfield
        // marker (accepted both backslash-escaped and bare) followed by the
        // subfield name '0'. The match therefore ends right before the ID value.
        final Pattern idFieldsPattern = Pattern
                .compile(startMarkers + "(?:003@|203@(?:/..+)?|107F) " +
                        " ?(\\" + PicaConstants.SUBFIELD_MARKER.get(isNormalized) + "|" +
                        PicaConstants.SUBFIELD_MARKER.get(isNormalized) + ")0");
        idFieldMatcher = idFieldsPattern.matcher("");
    }

    /**
     * Controls whether records having no record ID are reported as faulty. By
     * default such records are reported by the {@code PicaDecoder} by throwing
     * a {@link MissingIdException}.
     * <p>
     * The setting can be changed at any time. It becomes effective with the next
     * record that is being processed.
     * <p>
     * Default value: {@code false}
     *
     * @param ignoreMissingIdn if true, missing record IDs do not trigger a
     *                         {@link MissingIdException} but an empty string is
     *                         used as record identifier instead.
     */
    public void setIgnoreMissingIdn(final boolean ignoreMissingIdn) {
        this.ignoreMissingIdn = ignoreMissingIdn;
    }

    /**
     * Gets the flag to decide whether records without a record ID are processed.
     *
     * @return true if the ID of a record can be absent, otherwise false
     */
    public boolean getIgnoreMissingIdn() {
        return ignoreMissingIdn;
    }

    /**
     * Controls whether decomposed Unicode characters in field values are
     * normalised to their precomposed version. By default no normalisation is
     * applied. The normalisation is only applied to values not to field or
     * subfield names.
     * <p>
     * The setting can be changed at any time. It becomes effective with the next
     * record that is being processed.
     * <p>
     * Default value: {@code false}
     *
     * @param normalizeUTF8 if true, decomposed Unicode characters in values are
     *                      normalised to their precomposed version.
     */
    public void setNormalizeUTF8(final boolean normalizeUTF8) {
        parserContext.setNormalizeUTF8(normalizeUTF8);
    }

    /**
     * Gets the flag to decide whether the record is UTF-8 normalized.
     *
     * @return true if the record is UTF-8 normalized, otherwise false
     */
    public boolean getNormalizeUTF8() {
        return parserContext.getNormalizeUTF8();
    }

    /**
     * Controls whether fields without subfields are skipped and no events are
     * emitted for them. By default empty fields are skipped.
     * <p>
     * The setting can be changed at any time. It becomes effective with the next
     * record that is being processed.
     * <p>
     * Default value: {@code true}
     *
     * @param skipEmptyFields if true, then empty fields are skipped.
     */
    public void setSkipEmptyFields(final boolean skipEmptyFields) {
        parserContext.setSkipEmptyFields(skipEmptyFields);
    }

    /**
     * Gets the flag to decide whether to skip empty fields.
     *
     * @return true if empty fields are ignored, otherwise false
     */
    public boolean getSkipEmptyFields() {
        return parserContext.getSkipEmptyFields();
    }

    /**
     * Sets whether field names are trimmed (removal of leading and trailing
     * whitespace). By default field names are trimmed.
     * <p>
     * The setting can be changed at any time. It becomes effective with the next
     * record that is being processed.
     * <p>
     * Default value: {@code true}
     *
     * @param trimFieldNames if true, then field names are trimmed.
     */
    public void setTrimFieldNames(final boolean trimFieldNames) {
        parserContext.setTrimFieldNames(trimFieldNames);
    }

    /**
     * Gets the flag to decide whether the field names are trimmed.
     *
     * @return true if the field names are trimmed, otherwise false
     */
    public boolean getTrimFieldNames() {
        return parserContext.getTrimFieldNames();
    }

    /**
     * Parses a single pica+ record and emits it to the receiver as one
     * start-record / end-record pair. Input consisting only of spaces and
     * tabs (or nothing at all) produces no events.
     *
     * @param record the serialized record
     * @throws MissingIdException if no record ID field is found and
     *                            {@link #getIgnoreMissingIdn()} is false
     */
    @Override
    public void process(final String record) {
        assert !isClosed();

        buffer = StringUtil.copyToBuffer(record, buffer);
        recordLen = record.length();

        if (isRecordEmpty()) {
            return;
        }

        // The ID is resolved before any event is emitted so that a record
        // with a missing ID is rejected without a dangling start-record.
        String id = extractRecordId();
        if (id == null) {
            if (!ignoreMissingIdn) {
                throw new MissingIdException("Record has no id");
            }
            id = "";
        }
        getReceiver().startRecord(id);

        // Feed the record char by char through the parser state machine.
        PicaParserState state = PicaParserState.FIELD_NAME;
        for (int i = 0; i < recordLen; ++i) {
            state = state.parseChar(buffer[i], parserContext, isNormalized);
        }
        state.endOfInput(parserContext);

        getReceiver().endRecord();
    }

    @Override
    protected void onSetReceiver() {
        parserContext.setReceiver(getReceiver());
    }

    @Override
    protected void onResetStream() {
        parserContext.reset();
    }

    /**
     * Checks whether the current record contains nothing but spaces and tabs.
     *
     * @return true if every char of the current record is a space or a tab
     *         (this includes a record of length zero), otherwise false
     */
    private boolean isRecordEmpty() {
        for (int i = 0; i < recordLen; ++i) {
            if (buffer[i] != ' ' && buffer[i] != '\t') {
                return false;
            }
        }
        return true;
    }

    /**
     * Extracts the record ID from the current record. The ID consists of all
     * chars following the matched ID field prefix up to (but excluding) the
     * next marker char or the end of the record.
     *
     * @return the record ID (possibly empty), or null if the record contains
     *         no ID field
     */
    private String extractRecordId() {
        final int idFromIndex = findRecordId();
        if (idFromIndex == -1) {
            return null;
        }
        idBuilder.setLength(0);
        for (int i = idFromIndex; i < recordLen; ++i) {
            final char ch = buffer[i];
            if (isMarker(ch)) {
                break;
            }
            idBuilder.append(ch);
        }
        return idBuilder.toString();
    }

    /**
     * Searches the current record for the first ID field.
     *
     * @return the index of the first char after the matched ID field prefix,
     *         or -1 if the record contains no ID field
     */
    private int findRecordId() {
        idFieldMatcher.reset(new String(buffer, 0, recordLen));
        if (!idFieldMatcher.find()) {
            return -1;
        }
        return idFieldMatcher.end();
    }

    /**
     * Checks whether a char is one of the pica+ marker chars of the currently
     * selected serialization form.
     *
     * @param ch the char to test
     * @return true if {@link PicaConstants#from(boolean, char)} maps the char
     *         to anything other than {@link PicaConstants#NO_MARKER}
     */
    private boolean isMarker(final char ch) {
        return PicaConstants.from(isNormalized, ch) != PicaConstants.NO_MARKER;
    }

}