All downloads are free. Search and download functionality uses the official Maven repository.

org.pageseeder.flint.ixml.IndexParser Maven / Gradle / Ivy

/*
 * Copyright 2015 Allette Systems (Australia)
 * http://www.allette.com.au
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.pageseeder.flint.ixml;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.List;

import javax.xml.transform.sax.SAXResult;

import org.pageseeder.flint.IndexException;
import org.pageseeder.flint.indexing.FlintDocument;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

/**
 * This handler makes a Lucene 5 Document out of a properly formatted XML document.
 *
 * 

The XML document must validate the Lucene Index Document DTD. * *

For example: * *

{@code
 * 
 *   Jan 12, 02
 *   C:\documents\00023.xml
 *   /documents/doc-23.xml
 *   
 *     Truly to speak, and with no addition,
 *     We go to gain a little patch of ground
 *     That hath in it no profit but the name.
 *     To pay five ducats, five, I would not farm it;
 *     Nor will it yield to Norway or the Pole
 *     A ranker rate, should it be sold in fee.
 *   
 * 
 * }
* * @see Index Documents 1.0 Schema * @see Index Documents 2.0 Schema * @see Index Documents 5.0 Schema * * @author Christophe Lauret * @author Jean-Baptiste Reure * * @version 1 September 2015 */ public final class IndexParser { /** * THe XML reader to use. */ private final XMLReader _reader; /** * THe XML reader to use. */ private final SAXResult _result; /** * THe XML reader to use. */ private final IndexDocumentHandler _handler; /** * Creates a new IndexParser. * * @param reader The XML reader to use. */ IndexParser(XMLReader reader) { this._reader = reader; this._reader.setEntityResolver(FlintEntityResolver.getInstance()); this._reader.setErrorHandler(new FlintErrorHandler()); this._result = null; this._handler = null; } /** * Creates a new IndexParser. * * @param catalog The.. */ IndexParser(String catalog) { this._reader = null; this._handler = new AutoHandler(catalog); this._result = new SAXResult(this._handler); } //public methods ----------------------------------------------------------------------- public SAXResult getResult() { return this._result; } public List getDocuments() { return this._handler == null ? null : this._handler.getDocuments(); } /** * Make a collection Lucene documents to be indexed from the XML file given. * *

The XML file must conform to the DTD defined in this class. * *

Ensure that the reader uses the correct encoding. * * @param source The source to read. * @param catalog The catalog to add the fields to. * * @return A collection of Lucene documents made from the file. * * @throws IndexException Should an error occur while parsing the file. */ public synchronized List process(InputSource source, String catalog) throws IndexException { try { IndexDocumentHandler handler = new AutoHandler(catalog); this._reader.setContentHandler(handler); this._reader.parse(source); return handler.getDocuments(); } catch (SAXException ex) { throw new IndexException("An SAX error occurred while parsing source "+source.getSystemId()+": "+ex.getMessage(), ex); } catch (IOException ex) { throw new IndexException("An I/O error occurred while parsing the file "+source.getSystemId()+": "+ex.getMessage(), ex); } } /** * Returns a list of Lucene documents to be indexed from the XML file given. * *

The XML file must conform to the DTD defined in this class. * * @param f The file to be read. * @param catalog The catalog to add the fields to. * * @return A collection of Lucene documents made from the file. * * @throws IndexException Should an error occur while parsing the file. */ public synchronized List process(File f, String catalog) throws IndexException { try { InputSource source = new InputSource(new InputStreamReader(new FileInputStream(f), StandardCharsets.UTF_8)); source.setSystemId(f.toURI().toURL().toExternalForm()); return process(source, catalog); } catch (IOException ex) { throw new IndexException("I/O error occurred while generating file input source: "+ex.getMessage(), ex); } } // Inner class to determine which handler to use -------------------------------------------------- /** * A content handler to determine the version used. * * @author Christophe Lauret * @version 1 March 2010 */ private static final class AutoHandler extends DefaultHandler implements IndexDocumentHandler { /** * The catalog to use. */ private final String _catalog; /** * The handler in use. */ private IndexDocumentHandler _handler; /** * @param catalog the catalog to associate the fields with */ public AutoHandler(String catalog) { this._catalog = catalog; } /** * Once element "documents" is matched, the reader is assigned the appropriate handler. *

* {@inheritDoc} */ @Override public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { if ("documents".equals(qName) || "documents".equals(localName)) { String version = atts.getValue("version"); // Version 5.0 if ("5.0".equals(version)) { this._handler = new IndexDocumentHandler_5_0(this._catalog); // Version 2.0 } else if ("2.0".equals(version)) { this._handler = new IndexDocumentHandler_2_0(); // Assume version 1.0 } else { throw new SAXException("Unsupported iXML version "+version+", only 2.0 and 5.0 are supported"); } // Start processing the document with the new handler this._handler.startDocument(); } if (this._handler != null) { this._handler.startElement(uri, localName, qName, atts); } } @Override public void endElement(String uri, String localName, String qName) throws SAXException { if (this._handler != null) this._handler.endElement(uri, localName, qName); } @Override public void characters(char[] ch, int start, int length) throws SAXException { if (this._handler != null) this._handler.characters(ch, start, length); } @Override public void endDocument() throws SAXException { if (this._handler != null) this._handler.endDocument(); } /** * {@inheritDoc} */ @Override public List getDocuments() { if (this._handler == null) return Collections.emptyList(); return this._handler.getDocuments(); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy