org.metafacture.biblio.pica.PicaMultiscriptRemodeler Maven / Gradle / Ivy
Show all versions of metafacture-biblio Show documentation
/*
* Copyright 2014 Deutsche Nationalbibliothek
*
* Licensed under the Apache License, Version 2.0 the "License";
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.metafacture.biblio.pica;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import org.metafacture.flowcontrol.StreamBuffer;
import org.metafacture.framework.FluxCommand;
import org.metafacture.framework.StreamReceiver;
import org.metafacture.framework.annotations.Description;
import org.metafacture.framework.annotations.In;
import org.metafacture.framework.annotations.Out;
import org.metafacture.framework.helpers.DefaultStreamPipe;
/**
* Groups multiscript fields in entities.
*
* In Pica records some fields can be repeated if they contain contents with a
* non-latin script. These fields can be recognised by the existence of the
* subfields {@code $U} and {@code $T}. {@code $U} contains the name of the
* script used for the values of the fields. {@code $T} contains a number which
* is used to group fields together which contain the same contents in different
* scripts. All fields having the same field name and the same number in
* {@code $T} are considered to belong together.
*
* This module scans the input stream for Pica multiscript fields and remodels
* them by merging all fields belong to the same multiscript group into one
* entity. This entity has the original field name. Within this entity a new
* entity for each of the original fields is created. These entities are named
* depending on the type of script used. Three scripts are distinguished:
*
* - Latin
*
- NonLatinLR
*
- NonLatinRL
*
*
* The following example shows how the input
*
*
* 021A $T 01 $U Latn $a Title
* 021A $T 01 $U Grek $a Greek title
* 021C $T 01 $U Latn $a Subseries A
* 021C $T 02 $U Latn $a Subseries B
* 021C $T 01 $U Grek $a Greek subseries A
* 021C $T 02 $U Grek $a Greek subseries B
*
*
* is remodeled into
*
*
* 021A {
* Latin { T: 01, U: Latn, a: Title }
* NonLatinLR { T: 01, U: Grek, a: Greek title }
* }
* 021C {
* Latin { T: 01, U: Latn, a: Subseries A }
* NonLatinLR { T: 01, U: Grek, a: Greek subseries A }
* }
* 021C {
* Latin { T: 02, U: Latn, a: Subseries B }
* NonLatinLR { T: 02, U: Grek, a: Greek subseries B}
* }
*
*
* Fields which do not contain subfields $U and $T are passed through the module
* unaffected. If a multiscript field is encountered which only exists in a
* single script it is not remodeled but simply passed through. The module
* assumes that no more than two script-variants of a field exist. If a field
* with more than two variants is encountered then the behaviour of
* {@code PicaMultiscriptRemodeler} is undefined.
*
* The order of the output is determined by the order of the second occurrences
* of the multiscript fields. Multiscript fields without a second occurrences are
* output when the second occurrence of a field with a greater group number is
* encountered.
*
* If a field contains only $U or $T but not both, the field is simply passed
* through.
*
* If the sequence of input events does not follow the Pica record definitions
* (order of fields, nesting of entities) the behaviour of this module is
* undefined.
*
* @author Christoph Böhme
*
*/
@In(StreamReceiver.class)
@Out(StreamReceiver.class)
@Description("Groups multiscript fields in entities")
@FluxCommand("remodel-pica-multiscript")
public final class PicaMultiscriptRemodeler extends
DefaultStreamPipe {
public static final String ENTITY_NAME_FOR_LATIN = "Latin";
public static final String ENTITY_NAME_FOR_NON_LATIN_LR = "NonLatinLR";
public static final String ENTITY_NAME_FOR_NON_LATIN_RL = "NonLatinRL";
private static final BufferedField BEFORE_FIRST_FIELD = new BufferedField("", null);
private static final String GROUP_SUBFIELD = "T";
private static final String SCRIPT_SUBFIELD = "U";
private static final String LATIN_SCRIPT = "Latn";
private static final String ARABIC_SCRIPT = "Arab";
private static final String HEBREW_SCRIPT = "Hebr";
private BufferedField currentField;
private BufferedField lastField;
private final SortedMap bufferedFields = new TreeMap();
@Override
public void startRecord(final String identifier) {
getReceiver().startRecord(identifier);
currentField = null;
lastField = BEFORE_FIRST_FIELD;
bufferedFields.clear();
}
@Override
public void endRecord() {
emitAsSingleMultiscriptFields(bufferedFields);
getReceiver().endRecord();
}
@Override
public void startEntity(final String name) {
currentField = new BufferedField(name);
currentField.stream.setReceiver(getReceiver());
if (!lastField.name.equals(currentField.name)) {
emitAsSingleMultiscriptFields(bufferedFields);
}
}
@Override
public void endEntity() {
if (currentField.group == null || currentField.script == null) {
emitNonMultiscriptField();
} else {
if (bufferedFields.containsKey(currentField.group)) {
emitAsSingleMultiscriptFields(getSingleMultiscriptFieldsBeforeCurrentField());
emitRemodeledMultiscriptField(bufferedFields.remove(currentField.group), currentField);
} else {
bufferMultiscriptField(currentField);
}
}
lastField = currentField;
currentField = null;
}
@Override
public void literal(final String name, final String value) {
currentField.stream.literal(name, value);
if (GROUP_SUBFIELD.equals(name)) {
currentField.group = value;
} else if (SCRIPT_SUBFIELD.equals(name)) {
currentField.script = value;
}
}
private void bufferMultiscriptField(final BufferedField field) {
bufferedFields.put(field.group, field);
}
private Map getSingleMultiscriptFieldsBeforeCurrentField() {
return bufferedFields.headMap(currentField.group);
}
private void emitNonMultiscriptField() {
getReceiver().startEntity(currentField.name);
currentField.stream.replay();
getReceiver().endEntity();
}
private void emitRemodeledMultiscriptField(final BufferedField firstField, final BufferedField secondField) {
getReceiver().startEntity(firstField.name);
getReceiver().startEntity(mapScriptToEntityName(firstField.script));
firstField.stream.replay();
getReceiver().endEntity();
getReceiver().startEntity(mapScriptToEntityName(secondField.script));
secondField.stream.replay();
getReceiver().endEntity();
getReceiver().endEntity();
}
private void emitAsSingleMultiscriptFields(final Map fields) {
for (final BufferedField field : fields.values()) {
getReceiver().startEntity(field.name);
field.stream.replay();
getReceiver().endEntity();
}
fields.clear();
}
private String mapScriptToEntityName(final String script) {
if (LATIN_SCRIPT.equals(script)) {
return ENTITY_NAME_FOR_LATIN;
} else if (ARABIC_SCRIPT.equals(script)
|| HEBREW_SCRIPT.equals(script)) {
return ENTITY_NAME_FOR_NON_LATIN_RL;
}
return ENTITY_NAME_FOR_NON_LATIN_LR;
}
private static class BufferedField {
public String group;
public String script;
public final String name;
public final StreamBuffer stream;
public BufferedField(final String name) {
this(name, new StreamBuffer());
}
public BufferedField(final String name, final StreamBuffer stream) {
this.group = null;
this.script = null;
this.name = name;
this.stream = stream;
}
}
}