All Downloads are FREE. Search and download functionalities are using the official Maven repository.

nl.dedicon.pipeline.braille.calabash.impl.MetadataStep Maven / Gradle / Ivy

There is a newer version: 2.0.1
Show newest version
package nl.dedicon.pipeline.braille.calabash.impl;

import com.xmlcalabash.core.XProcException;
import com.xmlcalabash.core.XProcRuntime;
import com.xmlcalabash.core.XProcStep;
import com.xmlcalabash.io.ReadablePipe;
import com.xmlcalabash.io.WritablePipe;
import com.xmlcalabash.library.DefaultStep;
import com.xmlcalabash.runtime.XAtomicStep;
import java.io.StringReader;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.transform.stream.StreamSource;
import static net.sf.saxon.s9api.Axis.CHILD;
import net.sf.saxon.s9api.DocumentBuilder;
import net.sf.saxon.s9api.QName;
import net.sf.saxon.s9api.SaxonApiException;
import net.sf.saxon.s9api.XdmItem;
import net.sf.saxon.s9api.XdmNode;
import net.sf.saxon.s9api.XdmSequenceIterator;
import nl.dedicon.pipeline.braille.model.Book;
import nl.dedicon.pipeline.braille.model.Page;
import nl.dedicon.pipeline.braille.model.Section;
import nl.dedicon.pipeline.braille.model.Volume;
import org.daisy.braille.api.embosser.FileFormat;
import org.daisy.common.xproc.calabash.XProcStepProvider;
import org.daisy.pipeline.braille.common.Provider.util.MemoizingProvider;
import static org.daisy.pipeline.braille.common.Provider.util.dispatch;
import static org.daisy.pipeline.braille.common.Provider.util.memoize;
import org.daisy.pipeline.braille.common.Query;
import static org.daisy.pipeline.braille.common.Query.util.mutableQuery;
import static org.daisy.pipeline.braille.common.Query.util.query;
import org.daisy.pipeline.braille.pef.FileFormatProvider;
import org.osgi.service.component.annotations.Component;
import org.osgi.service.component.annotations.Reference;
import org.osgi.service.component.annotations.ReferenceCardinality;
import org.osgi.service.component.annotations.ReferencePolicy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * XProc step for metadata
 * 
 * @author Paul Rambags
 */
public class MetadataStep extends DefaultStep {

    private static final Logger logger = LoggerFactory.getLogger(MetadataStep.class);

    // Names of the step options. _xquery and _brf_file_extension are no longer
    // read by run(); they are kept so existing references keep compiling.
    private static final QName _xquery = new QName("xquery");
    private static final QName _identifier = new QName("identifier");
    private static final QName _brf_file_extension = new QName("brf-file-extension");
    private static final QName _brf_file_format = new QName("brf-file-format");
    private static final QName _brf_name_pattern = new QName("brf-name-pattern");
    private static final QName _brf_number_width = new QName("brf-number-width");
    private static final QName _optional_date = new QName("optional-date");

    private static final String PEF_NAMESPACE = "http://www.daisy.org/ns/2008/pef";
    // Braille cells for the digits 0..9; the index of a cell in this string is its numeric value.
    private static final String BRAILLE_DIGITS = "⠚⠁⠃⠉⠙⠑⠋⠛⠓⠊";
    // whitespace (optional from print number)(optional until page number) whitespace (optional page number)
    // Each of the three groups may match the empty string, so group(n) is never null after a successful find().
    private static final Pattern HEADER = Pattern.compile("[\u2800\\s]+(⠼[" + BRAILLE_DIGITS + "]+|)([\u2800\\s]?⠤[\u2800\\s]?⠼[" + BRAILLE_DIGITS + "]+|)[\u2800\\s]+(⠼[" + BRAILLE_DIGITS + "]+|)");
    // 'y' (year-of-era) rather than 'Y' (week-based-year): 'Y' yields the wrong year
    // for dates in the first/last days of a calendar year.
    private static final DateTimeFormatter DAY_MONTH_YEAR = DateTimeFormatter.ofPattern("d-M-y");

    // Used to look up the file format (and thereby the file extension) for the brf-file-format query.
    private final MemoizingProvider<Query, FileFormat> fileFormatProvider;

    // Pipes assigned by setInput() / setOutput().
    private ReadablePipe source = null;
    private WritablePipe result = null;
    
    /**
     * Creates the step. Instances are created through {@link Provider#newStep}.
     *
     * @param runtime the XProc runtime, passed on to the superclass
     * @param step the atomic step, passed on to the superclass
     * @param fileFormatProvider provider used to resolve the file format query into a file format
     */
    private MetadataStep(XProcRuntime runtime, XAtomicStep step, MemoizingProvider<Query, FileFormat> fileFormatProvider) {
        super(runtime, step);
        this.fileFormatProvider = fileFormatProvider;
    }

    /**
     * Remembers the pipe from which {@link #run()} reads the PEF document.
     * The port name is not inspected.
     *
     * @param port name of the input port (ignored)
     * @param pipe pipe to read from
     */
    @Override
    public void setInput(String port, ReadablePipe pipe) {
        this.source = pipe;
    }

    /**
     * Remembers the pipe to which {@link #run()} writes the metadata document.
     * The port name is not inspected.
     *
     * @param port name of the output port (ignored)
     * @param pipe pipe to write to
     */
    @Override
    public void setOutput(String port, WritablePipe pipe) {
        this.result = pipe;
    }

    /**
     * Resets the reader of the input pipe and the writer of the output pipe.
     */
    @Override
    public void reset() {
        this.source.resetReader();
        this.result.resetWriter();
    }

    /**
     * Reads the PEF document from the input pipe, derives the volume metadata
     * from it and writes the resulting metadata document to the output pipe.
     *
     * <p>Any exception raised while doing so is logged and rethrown wrapped in
     * an {@link XProcException}.</p>
     *
     * @throws SaxonApiException declared by the overridden method
     */
    @Override
    public void run() throws SaxonApiException {
        super.run();

        try {

            final XdmNode pef = source.read();

            // The "xquery" option is deliberately not read any more. The page
            // numbers are taken from the header (the first line of each page)
            // in Java instead, for two reasons:
            //
            // 1. easy parsing of headers, incl. print page numbers
            // 2. possibility to adjust to unexpected numbering in the PEF:
            //    in duplex mode each section should start at an odd page,
            //    but Dotify seems to behave a little strange now and then,
            //    so the metadata is adjusted to what Dotify produces
            final String identifier = getOption(_identifier, "");
            final String brfFileFormat = getOption(_brf_file_format, "");
            final String brfNamePattern = getOption(_brf_name_pattern, "");
            final int brfNumberWidth = getOption(_brf_number_width, 0);
            final String optionalDate = getOption(_optional_date, "");

            final String brfFileExtension = getFileExtension(brfFileFormat);

            final Book book = parsePEF(pef);
            final String metadataXml = createMetadataXml(book, identifier, brfNamePattern, brfNumberWidth, brfFileExtension, optionalDate);

            // turn the serialized metadata into a node that can be written to the pipe
            final StreamSource metadataSource = new StreamSource(new StringReader(metadataXml));
            result.write(runtime.getProcessor().newDocumentBuilder().build(metadataSource));

        } catch (Exception e) {

            logger.error("dedicon:metadata failed", e);
            throw new XProcException(step.getNode(), e);

        }
    }

    /**
     * Looks up the file extension of the first file format that the file
     * format provider returns for the given query.
     *
     * @param fileFormatQuery the file format query, in the syntax accepted by {@code query()}
     * @return the file extension of the first matching file format,
     *         or an empty string if the provider returns no file format
     */
    private String getFileExtension (String fileFormatQuery) {
        Query.MutableQuery q = mutableQuery(query(fileFormatQuery));
        Iterable<FileFormat> fileFormats = fileFormatProvider.get(q);
        // only the first match is of interest
        for (FileFormat fileFormat : fileFormats) {
            return fileFormat.getFileExtension();
        }
        return "";
    }
    
    /**
     * Builds a {@link Book} from a PEF document by walking the element path
     * pef / body / volume / section / page (all in the PEF namespace).
     *
     * <p>Every volume element yields a {@link Volume} whose duplex flag is taken
     * from its {@code duplex} attribute, every section element a {@link Section},
     * and every page element a {@link Page} with the page numbers found in its
     * first row. Afterwards the per-volume metadata is computed.</p>
     *
     * @param pef the node that has the {@code pef} element as a child
     * @return the populated book
     */
    private Book parsePEF(XdmNode pef) {
        Book book = new Book();
        for (XdmNode pefRoot : getChildren(pef, PEF_NAMESPACE, "pef")) {
            for (XdmNode pefBody : getChildren(pefRoot, PEF_NAMESPACE, "body")) {
                for (XdmNode pefVolume : getChildren(pefBody, PEF_NAMESPACE, "volume")) {
                    Volume volume = new Volume();
                    book.getVolumes().add(volume);
                    String duplexValue = pefVolume.getAttributeValue(new QName("duplex"));
                    volume.setDuplex("true".equalsIgnoreCase(duplexValue));
                    for (XdmNode pefSection : getChildren(pefVolume, PEF_NAMESPACE, "section")) {
                        Section section = new Section();
                        volume.getSections().add(section);
                        for (XdmNode pefPage : getChildren(pefSection, PEF_NAMESPACE, "page")) {
                            Page page = new Page();
                            setPageNumbers(page, pefPage);
                            section.getPages().add(page);
                        }
                    }
                }
            }
        }
        determineVolumeMetadata(book);
        return book;
    }
    
    /**
     * Sets the page numbers of a page from the text of the first {@code row}
     * child of the given PEF page element. Does nothing if the page has no rows.
     *
     * @param page the page to update
     * @param pefPage the PEF page element
     */
    private void setPageNumbers(Page page, XdmNode pefPage) {
        List<XdmNode> rows = getChildren(pefPage, PEF_NAMESPACE, "row");
        if (!rows.isEmpty()) {
            // the header is the first row of the page
            setPageNumbers(page, rows.get(0).getStringValue());
        }
    }
    
    /**
     * Collects the child nodes of {@code parent} that have the given
     * namespace and local name, in the order the child axis returns them.
     *
     * <p>This method should be removed in the future because newer versions
     * of Saxon have a children() method in class XdmNode.</p>
     *
     * @param parent the node whose children are inspected
     * @param namespace namespace URI of the wanted children
     * @param child local name of the wanted children
     * @return the matching child nodes; empty if there are none
     */
    private List<XdmNode> getChildren(XdmNode parent, String namespace, String child) {
        List<XdmNode> children = new ArrayList<>();
        XdmSequenceIterator iterator = parent.axisIterator(CHILD, new QName(namespace, child));
        while (iterator.hasNext()) {
            XdmItem item = iterator.next();
            if (item instanceof XdmNode) {
                children.add((XdmNode)item);
            }
        }
        return children;
    }
    
    /**
     * Extracts the print page range and the braille page number from a page
     * header and stores them in the page. If the header does not match the
     * {@code HEADER} pattern, the page is left untouched.
     *
     * @param page the page to update
     * @param header text of the first row of the page
     */
    private void setPageNumbers(Page page, String header) {
        Matcher headerMatcher = HEADER.matcher(header);
        if (!headerMatcher.find()) {
            // no page number found - do nothing
            return;
        }

        String printFrom = afterNumberSign(headerMatcher.group(1));
        String printUntil = afterNumberSign(headerMatcher.group(2));
        String braillePage = afterNumberSign(headerMatcher.group(3));

        // if there is only one print page number, it is used for both ends of the range
        if (printUntil.isEmpty()) {
            printUntil = printFrom;
        } else if (printFrom.isEmpty()) {
            printFrom = printUntil;
        }

        page.setFromPrintPageNumber(getNumber(printFrom));
        page.setUntilPrintPageNumber(getNumber(printUntil));
        page.setPageNumber(getNumber(braillePage));
    }
    
    /**
     * Returns the part of a braille string that follows the first number sign.
     *
     * @param brailleNumber braille text that may contain a number sign
     * @return everything after the first number sign, or an empty string
     *         if the text contains no number sign
     */
    private String afterNumberSign(String brailleNumber) {
        int numberSignPosition = brailleNumber.indexOf('⠼');
        // a negative position means: no number sign found
        return numberSignPosition < 0 ? "" : brailleNumber.substring(numberSignPosition + 1);
    }
    
    /**
     * Converts a string of braille digits to its numeric value.
     *
     * <p>The value of each cell is its index in {@code BRAILLE_DIGITS}. The
     * input is not validated: a character that is not in {@code BRAILLE_DIGITS}
     * contributes -1.</p>
     *
     * @param brailleNumber braille digits without a number sign
     * @return the numeric value, or {@code null} for an empty string
     */
    private Integer getNumber(String brailleNumber) {
        if (brailleNumber.isEmpty()) {
            return null;
        }

        int value = 0;
        for (char brailleDigit : brailleNumber.toCharArray()) {
            value = value * 10 + BRAILLE_DIGITS.indexOf(brailleDigit);
        }
        return value;
    }
    
    /**
     * Computes the first/last print page number and the first/last braille
     * page number of every volume in the book, in volume order.
     *
     * <p>Page numbering starts at 1 for the first volume. Once a volume has
     * been numbered, the last page number of the volume before it is set to
     * one less than this volume's first page number.</p>
     *
     * @param book the book whose volumes are updated
     */
    private void determineVolumeMetadata(Book book) {
        int nextFirstPageNumber = 1;
        Volume predecessor = null;
        for (Volume current : book.getVolumes()) {
            current.setFirstPrintPageNumber(getFirstPrintPageNumber(current));
            current.setLastPrintPageNumber(getLastPrintPageNumber(current));
            nextFirstPageNumber = setPageNumbers(current, nextFirstPageNumber);
            if (predecessor != null) {
                // make the previous volume end right before this one starts
                predecessor.setLastPageNumber(current.getFirstPageNumber() - 1);
            }
            predecessor = current;
        }
    }
        
    /**
     * Finds the first "from" print page number in the volume, scanning its
     * sections and pages in order.
     *
     * @param volume the volume to scan
     * @return the "from" print page number of the first page that has one,
     *         or {@code null} if no page has one
     */
    private Integer getFirstPrintPageNumber(Volume volume) {
        for (Section section : volume.getSections()) {
            for (Page page : section.getPages()) {
                Integer fromPrintPageNumber = page.getFromPrintPageNumber();
                if (fromPrintPageNumber != null) {
                    return fromPrintPageNumber;
                }
            }
        }
        return null;
    }
        
    /**
     * Returns the highest "until" print page number in the volume.
     *
     * <p>This is deliberately the highest print page number, not the last one,
     * which is good especially when the last volume ends with a TOC that has
     * out-of-range print page numbers.</p>
     *
     * <p>Another strategy would be to discard print page numbers lower than
     * the highest one of the previous volume.</p>
     *
     * @param volume the volume to scan
     * @return the highest "until" print page number, or {@code null} if no page has one
     */
    private Integer getLastPrintPageNumber(Volume volume) {
        return volume.getSections().stream()
                .map(Section::getPages)
                .flatMap(Collection::stream)
                .map(Page::getUntilPrintPageNumber)
                .filter(Objects::nonNull)
                // pick the maximum directly instead of sorting all values
                .max(Integer::compareTo)
                .orElse(null);
    }

    /**
     * Sets the page numbers and returns the expected first page number of the next volume
     *
     * <p>The volume's first and last page number both start out as
     * {@code expectedFirstPageNumber}. The pages are then walked in order with a
     * running counter. A page that carries its own page number resets the counter
     * to that number; the first such page also shifts the volume's first page
     * number by the difference between the number found and the counter. In a
     * duplex volume, a section with an odd number of pages advances the counter
     * by one extra page.</p>
     * 
     * @param volume the volume whose first and last page number are set
     * @param expectedFirstPageNumber page number the volume starts at if its pages do not say otherwise
     * @return expected first page number of next volume
     */
    private int setPageNumbers(Volume volume, int expectedFirstPageNumber) {
        volume.setFirstPageNumber(expectedFirstPageNumber);
        volume.setLastPageNumber(expectedFirstPageNumber);
        // running counter: the number the current page gets unless its header says otherwise
        int lastPageNumber = expectedFirstPageNumber;
        boolean pageNumberFound = false;
        for (Section section : volume.getSections()) {
            int pagesInThisSection = 0;
            for (Page page : section.getPages()) {
                if (page.getPageNumber() != null) {
                    if (!pageNumberFound) {
                        // the first page number is adjusted
                        // by the offset between the number in the header and the counter
                        volume.setFirstPageNumber(volume.getFirstPageNumber() + page.getPageNumber() - lastPageNumber);
                        pageNumberFound = true;
                    }
                    // from here on, continue counting from the number in the header
                    lastPageNumber = page.getPageNumber();
                }
                // record the number of this page before advancing to the next one
                volume.setLastPageNumber(lastPageNumber);
                lastPageNumber ++;
                pagesInThisSection ++;
            }
            
            // add one in case of duplex mode and the section has an odd number of pages
            if (volume.getDuplex()) {
                lastPageNumber += pagesInThisSection % 2;
            }
        }
        
        return lastPageNumber;
    }
    
    /**
     * Serializes the metadata of the book to a string.
     *
     * <p>The output starts with the identifier, followed per volume by: the BRF
     * file name, the literal text "br", the 1-based volume index, the first and
     * last print page number and their count, the first and last page number and
     * their count, "Y" for the last volume and "N" otherwise, and the date.
     * Numbers that are {@code null} are left out.</p>
     *
     * <p>NOTE(review): every string literal that would be expected to carry
     * markup is empty in this copy of the source. It looks like the element tags
     * were lost; as written, the values are concatenated without any markup.
     * Restore the literals from the original source before relying on this
     * method.</p>
     *
     * <p>NOTE(review): the values are appended without any XML escaping.</p>
     *
     * @param book the book whose volumes are serialized
     * @param identifier the identifier of the book
     * @param brfNamePattern BRF file name pattern, passed to {@code getFilename}
     * @param brfNumberWidth minimum width of the volume number in the file name
     * @param brfFileExtension file extension appended to the file name
     * @param optionalDate the date to output; if {@code null} or empty, today's date is used
     * @return the serialized metadata
     * @throws SaxonApiException declared, but not thrown by any statement visible in this method
     */
    private String createMetadataXml(Book book, String identifier, String brfNamePattern, int brfNumberWidth, String brfFileExtension, String optionalDate) throws SaxonApiException {
        String date = optionalDate;
        if (date == null || date.length() == 0) {
            // no date given: fall back to today
            date = LocalDate.now().format(DAY_MONTH_YEAR);
        }
        StringBuilder xml = new StringBuilder();
        xml.append("").append(identifier).append("");
        int volumeIndex = 0;
        for (Volume volume : book.getVolumes()) {
            volumeIndex ++;
            xml.append("");
            // file name of this volume
            xml.append("").append(getFilename(volumeIndex, brfNamePattern, brfNumberWidth, brfFileExtension)).append("");
            xml.append("br");
            xml.append("").append(volumeIndex).append("");
            // first print page number
            xml.append("");
            if (volume.getFirstPrintPageNumber() != null) {
                xml.append(volume.getFirstPrintPageNumber());
            }
            xml.append("");
            // last print page number
            xml.append("");
            if (volume.getLastPrintPageNumber() != null) {
                xml.append(volume.getLastPrintPageNumber());
            }
            xml.append("");
            // number of print pages, only if both ends of the range are known
            xml.append("");
            if (volume.getFirstPrintPageNumber() != null && volume.getLastPrintPageNumber() != null) {
                xml.append(volume.getLastPrintPageNumber() - volume.getFirstPrintPageNumber() + 1);
            }
            xml.append("");
            // first page number
            xml.append("");
            if (volume.getFirstPageNumber() != null) {
                xml.append(volume.getFirstPageNumber());
            }
            xml.append("");
            // last page number
            xml.append("");
            if (volume.getLastPageNumber() != null) {
                xml.append(volume.getLastPageNumber());
            }
            xml.append("");
            // number of pages, only if both ends of the range are known
            xml.append("");
            if (volume.getFirstPageNumber() != null && volume.getLastPageNumber() != null) {
                xml.append(volume.getLastPageNumber() - volume.getFirstPageNumber() + 1);
            }
            xml.append("");
            // "Y" marks the last volume of the book
            xml.append("").append(volumeIndex == book.getVolumes().size() ? "Y" : "N").append("");
            xml.append("").append(date).append("");
            xml.append("");          
        }
        String document = "".concat(xml.toString()).concat("");
        return document;
    }

    /**
     * Builds the BRF file name of a volume.
     *
     * <p>The volume index is left-padded with zeros to at least
     * {@code brfNumberWidth} characters, every occurrence of <code>{}</code> in
     * the name pattern is replaced by it, and the file extension is appended.</p>
     *
     * @param volumeIndex the volume index
     * @param brfNamePattern name pattern in which <code>{}</code> stands for the volume number
     * @param brfNumberWidth minimum number of characters of the volume number
     * @param brfFileExtension text appended after the name
     * @return the file name
     */
    private String getFilename(int volumeIndex, String brfNamePattern, int brfNumberWidth, String brfFileExtension) {
        StringBuilder paddedNumber = new StringBuilder(String.valueOf(volumeIndex));
        for (int missing = brfNumberWidth - paddedNumber.length(); missing > 0; missing--) {
            paddedNumber.insert(0, '0');
        }
        return brfNamePattern.replace("{}", paddedNumber.toString()) + brfFileExtension;
    }
    
    /**
     * Component that provides {@link MetadataStep} instances for the step type
     * {@code {http://www.dedicon.nl}metadata}.
     *
     * <p>File format providers are bound and unbound dynamically; the steps
     * created here look up file formats through a memoizing dispatcher over
     * the currently bound providers.</p>
     */
    @Component(
            name = "dedicon:metadata",
            service = {XProcStepProvider.class},
            property = {"type:String={http://www.dedicon.nl}metadata"}
    )
    public static class Provider implements XProcStepProvider {

        // the dispatcher reads this list on every lookup, so providers bound later are seen as well
        private final List<FileFormatProvider> fileFormatProviders = new ArrayList<>();
        private final MemoizingProvider<Query, FileFormat> fileFormatProvider = memoize(dispatch(fileFormatProviders));

        /**
         * Creates a new metadata step that shares this component's file format provider.
         *
         * @param runtime the XProc runtime
         * @param step the atomic step
         * @return the new step
         */
        @Override
        public XProcStep newStep(XProcRuntime runtime, XAtomicStep step) {
            return new MetadataStep(runtime, step, fileFormatProvider);
        }

        /**
         * Registers a file format provider.
         *
         * @param provider the provider to add
         */
        @Reference(
                name = "FileFormatProvider",
                unbind = "unbindFileFormatProvider",
                service = FileFormatProvider.class,
                cardinality = ReferenceCardinality.MULTIPLE,
                policy = ReferencePolicy.DYNAMIC
        )
        protected void bindFileFormatProvider(FileFormatProvider provider) {
                fileFormatProviders.add(provider);
        }

        /**
         * Unregisters a file format provider and invalidates the memoized
         * lookups, which may stem from the removed provider.
         *
         * @param provider the provider to remove
         */
        protected void unbindFileFormatProvider(FileFormatProvider provider) {
                fileFormatProviders.remove(provider);
                this.fileFormatProvider.invalidateCache();
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy