All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.julielab.xml.JulieXMLTools Maven / Gradle / Ivy

/**
 * Utils.java
 * 

* Copyright (c) 2010, JULIE Lab. * All rights reserved. This program and the accompanying materials * are made available under the terms of the Common Public License v1.0 *

* Author: chew *

* Current version: 1.0 * Since version: 1.0 *

* Creation date: 16.11.2010 **/ package de.julielab.xml; import com.ximpleware.*; import com.ximpleware.EOFException; import com.ximpleware.extended.*; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.*; import java.net.MalformedURLException; import java.net.URL; import java.nio.charset.StandardCharsets; import java.sql.Timestamp; import java.util.*; import java.util.stream.Collectors; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; /** Source of a single field value of an extracted data row. */ interface FieldValueSource { /** Returns the field value; the implementations in this file return a String, a String[], a byte[], a Timestamp or null. Throws FieldValueRetrievalException if the value cannot be retrieved. */ Object getFieldValue() throws FieldValueRetrievalException; } /** * Utility class offering convenience methods. * * @author faessler */ public class JulieXMLTools { /** Fragment type for getFragment: the whole element including its start and end tag. */ public static final int ELEMENT_FRAGMENT = 0; /** Fragment type for getFragment: only the content enclosed by the element's tags. */ public static final int CONTENT_FRAGMENT = 1; static final Logger LOG = LoggerFactory.getLogger(JulieXMLTools.class); /** * Convenience method for quick construction of a row iterator over an XML * document. *

*

* The fileName determines the location of the XML file to * return data records from. For more detailed information see * {@link #constructRowIterator(VTDNav, String, List, String)}. * * @param fileName XML file to return data rows from. * @param bufferSize Size of buffers while reading the file at * fileName. * @param forEachXpath An XPath expression determining the XML elements to retrieve * data records from. * @param fields List of attribute-value pairs determining the record fields * returned by the iterator. * @return An iterator over all rows extracted from the XMl document pointed * to by fileName. */ public static Iterator> constructRowIterator(String fileName, int bufferSize, String forEachXpath, final List> fields, boolean largeFileSize) { try { if (largeFileSize) { return constructRowIteratorHuge(fileName, forEachXpath, fields); } else { InputStream is; if (fileName.endsWith(".gz") || fileName.endsWith(".gzip")) { is = new GZIPInputStream(new FileInputStream(fileName)); } else if (fileName.endsWith(".zip")) { LOG.info("Got a ZIP archive at {}. 
It will be scanned for XML entry files.", fileName); ZipFile zipFile = new ZipFile(fileName, StandardCharsets.UTF_8); final List sortedEntries = zipFile.stream().sorted(Comparator.comparing(ZipEntry::getName)).collect(Collectors.toList()); return new Iterator>() { private Iterator zipEntryIt = sortedEntries.iterator(); private ZipEntry entry = nextZipEntry(); private Iterator> internalIterator; private Map nextRow; @Override public boolean hasNext() { if (nextRow == null) { if (internalIterator != null && internalIterator.hasNext()) { nextRow = internalIterator.next(); } else if (entry != null) { while ((internalIterator == null || !internalIterator.hasNext()) && entry != null) { if (entry.isDirectory() || !hasValidEnding(entry.getName())) { LOG.info("Skipping ZIP entry {}", entry.getName()); entry = nextZipEntry(); continue; } VTDNav vn = null; try { LOG.info("Processing ZIP entry {}", entry.getName()); InputStream entryIs = zipFile.getInputStream(entry); if (entry.getName().toLowerCase().endsWith(".gz") || entry.getName().toLowerCase().endsWith("gzip")) entryIs = new GZIPInputStream(entryIs); vn = getVTDNav(entryIs, bufferSize); } catch (ParseException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } internalIterator = constructRowIterator(vn, forEachXpath, fields, fileName); entry = nextZipEntry(); } nextRow = internalIterator.next(); } } return nextRow != null; } private boolean hasValidEnding(String filename) { String lc = filename; return lc.endsWith("xml") || lc.endsWith("xml.gz") || lc.endsWith("xml.gzip"); } @Override public Map next() { hasNext(); Map ret = nextRow; nextRow = null; return ret; } private ZipEntry nextZipEntry() { return zipEntryIt.hasNext() ? 
zipEntryIt.next() : null; } }; } else { is = new FileInputStream(fileName); } VTDNav vn = getVTDNav(is, bufferSize); return constructRowIterator(vn, forEachXpath, fields, fileName); } } catch (FileNotFoundException e) { LOG.error(String.format("File %s could not be found.", fileName)); e.printStackTrace(); } catch (FileTooBigException e) { try { LOG.info("Falling back on VTD XML 'Huge' parser for large XML files..."); return constructRowIteratorHuge(fileName, forEachXpath, fields); } catch (ParseExceptionHuge e1) { LOG.error("Error while parsing file " + fileName + ": ", e1.getMessage()); e1.printStackTrace(); System.exit(1); } catch (IOException e1) { e1.printStackTrace(); } } catch (IOException e) { e.printStackTrace(); } catch (ParseException e) { LOG.error("Error while parsing file " + fileName + ": ", e.getMessage()); e.printStackTrace(); System.exit(1); } catch (ParseExceptionHuge e) { LOG.error("Error while parsing file " + fileName + ": ", e.getMessage()); e.printStackTrace(); System.exit(1); } return null; } /** * @param fileName * @param forEachXpath * @param fields * @return * @throws IOException * @throws ParseExceptionHuge * @throws EncodingExceptionHuge * @throws EOFExceptionHuge * @throws EntityExceptionHuge */ private static Iterator> constructRowIteratorHuge(String fileName, String forEachXpath, final List> fields) throws IOException, ParseExceptionHuge { JulieXMLBuffer buffer = new JulieXMLBuffer(); buffer.readFile(fileName); VTDGenHuge vg = new VTDGenHuge(); vg.setDoc(buffer); vg.parse(true); VTDNavHuge vn = vg.getNav(); return constructRowIterator(vn, forEachXpath, fields, fileName); } /** * Convenience method for quick construction of a row iterator over an XML * document. * * * data contains the XML data to return data records from. For * more detailed information see * {@link #constructRowIterator(VTDNav, String, List, String)}. * * @param data Byte array containing an XML document. 
* @param bufferSize Size of buffers while reading the file at * fileName. * @param forEachXpath An XPath expression determining the XML elements to retrieve * data records from. * @param fields List of attribute-value pairs determining the record fields * returned by the iterator. * @param identifier A string identifying the XML document in data, * needed for error messages. * @return An iterator over all rows extracted from the XMl document pointed * to by fileName. */ public static Iterator> constructRowIterator(byte[] data, int bufferSize, String forEachXpath, List> fields, String identifier) { try { VTDGen vg = new VTDGen(); vg.setDoc(data); vg.parse(true); VTDNav vn = vg.getNav(); return constructRowIterator(vn, forEachXpath, fields, identifier); } catch (ParseException e) { LOG.error("Error while parsing document " + identifier); e.printStackTrace(); System.exit(1); } return null; } /** * This method corresponds to * {@link #constructRowIterator(VTDNav, String, List, String)} but employs * the VTD-XML-Huge API to extract data from very large XML documents. Due * to API differences, there are a few shortcomings however:
*

    *
  • The iterator returned by this method is not able to automatically * extract namespace URIs. If namespace-aware processing is necessary, * namespace prefixes and their corresponding URIs must be made manually * configurable. *
* * @param vn VTDNavHuge object to the XML object. The object * should not have been used before, so the current VTD index * points to the document root. * @param forEachXpath An absolute XPath expression determining the slice(s) of the * XML document from which data rows are extracted. * @param fields The fields to be returned with each data row. * @param identifier Used in error messages * @return An iterator over all rows extracted from the XMl document * navigated by vn. */ private static Iterator> constructRowIterator(VTDNavHuge vn, String forEachXpath, List> fields, String identifier) { final AutoPilotHuge ap = new AutoPilotHuge(vn); try { // starting conditions ap.selectXPath(forEachXpath); final int startIndex = ap.evalXPath(); if (startIndex == -1) LOG.debug("Couldn't find XPath: " + forEachXpath + " in document " + identifier); final Map navigators = new HashMap(); for (Map field : fields) { String xPath = field.get(JulieXMLConstants.XPATH); Options options = new Options(); String fieldName = field.get(JulieXMLConstants.NAME); if (xPath != null) { AutoPilotHuge pilot = new AutoPilotHuge(vn); AutoPilotHuge pilotForEach = new AutoPilotHuge(vn); String fieldForEach = field.get(JulieXMLConstants.FOR_EACH); // Default: The XPath attribute directly holds the path to // the desired value. The ForEachAP is the main navigator, // so it must select the XPath if no specific ForEachXPath // is given. pilotForEach.selectXPath(xPath); pilot.selectXPath(xPath); if (fieldForEach != null) { pilotForEach = new AutoPilotHuge(vn); pilotForEach.selectXPath(fieldForEach); } else { // If there is no ForEachXPath indeed, the ForEachAP // always selects the path. The XPathAP is nearly // unemployed now: It just needs to point to the spot // the ForEachXPath navigated to. 
pilot.selectXPath("."); } options.returnXMLFragment = Boolean.parseBoolean(field.get(JulieXMLConstants.RETURN_XML_FRAGMENT)); options.returnArray = Boolean.parseBoolean(field.get(JulieXMLConstants.RETURN_ARRAY)); // options.returnAttributeValue = xPath // .matches(REGEX_XPATH_ATTR); options.concatString = field.get(JulieXMLConstants.CONCAT_STRING); if (options.concatString == null) options.concatString = ","; options.performGzip = Boolean.parseBoolean(field.get(JulieXMLConstants.GZIP)); navigators.put(fieldName, new XPathNavigatorHuge(vn, pilotForEach, pilot, options)); } else if (Boolean.parseBoolean(field.get(JulieXMLConstants.EXTRACT_FROM_FILENAME))) { String[] path = identifier.split("/"); navigators.put(fieldName, new FileNameValueSource(path[path.length - 1], field)); } else if (field.get(JulieXMLConstants.CONSTANT_VALUE) != null) { String value = field.get(JulieXMLConstants.CONSTANT_VALUE); navigators.put(fieldName, new ConstantFieldValueSource(value)); } else { LOG.warn("Field with name \"" + fieldName + "\" does not define a source to get a value from (e.g. XML XPath or file name) and will not have imported any values."); } } return new Iterator>() { int index = startIndex; public boolean hasNext() { return index != -1; } public Map next() { if (!hasNext()) return null; Map row = new HashMap(); try { for (String fieldName : navigators.keySet()) { FieldValueSource navi = navigators.get(fieldName); vn.push(); Object fieldValue = navi.getFieldValue(); vn.pop(); row.put(fieldName, fieldValue); } row.put(JulieXMLConstants.VTD_INDEX, index); index = ap.evalXPath(); return row; } catch (XPathEvalExceptionHuge e) { e.printStackTrace(); } catch (NavExceptionHuge e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } return null; } public void remove() { } }; } catch (XPathEvalException e) { e.printStackTrace(); } catch (NavException e) { e.printStackTrace(); } catch (VTDExceptionHuge e) { e.printStackTrace(); } return null; } /** *

* The VTDNav vn is a VTD navigator over the XML file to return * data records from. For each evaluation of the forEach XPath * expression, one data row is created *

*

* Such a row consist of the fields given by the list fields * The list contains Maps of attribute-value pairs. All fields * are required to have a {@link JulieXMLConstants#XPATH} attribute which specifies * the XPath pointing to information in the XML documents to retrieve. * Likewise, a {@link JulieXMLConstants#NAME} attribute is required. This attribute * determines the name of the field in the resulting row containing the * information retrieved by the field's Constants.XPATH * attribute. *

*

* Example: *

* A field with the following attribute-value-pairs *

*

* <field name="pmid" * xpath="/MedlineCitationSet/MedlineCitation/PMID" > *

* will create one field in each returned data row named "pmid" and its * value will by the character data at the XPath * "/MedlineCitationSet/MedlineCitation/PMID". *

* * @param vn The {@link VTDNav} object which navigates over the XML * document to retrieve records from. * @param forEachXpath An XPath expression determining the XML elements for each of * which one row should be created. * @param fields The fields to be returned with each data row. * @return An iterator over all rows extracted from the XMl document * navigated by vn. */ public static Iterator> constructRowIterator(final VTDNav vn, String forEachXpath, final List> fields, String identifier) { final AutoPilot ap = new AutoPilot(vn); try { Map namespaceMap = buildNamespaceMap(vn.duplicateNav()); // starting conditions declareNamespaces(ap, namespaceMap); ap.selectXPath(forEachXpath); final int startIndex = ap.evalXPath(); if (startIndex == -1) LOG.debug("Couldn't find XPath: " + forEachXpath + " in document " + identifier); final Map navigators = new HashMap(); for (Map field : fields) { String xPath = field.get(JulieXMLConstants.XPATH); final Options options = new Options(); String fieldName = field.get(JulieXMLConstants.NAME); if (xPath != null) { AutoPilot pilot = new AutoPilot(vn); AutoPilot pilotForEach = new AutoPilot(vn); String fieldForEach = field.get(JulieXMLConstants.FOR_EACH); declareNamespaces(pilot, namespaceMap); declareNamespaces(pilotForEach, namespaceMap); // Default: The XPath attribute directly holds the path to // the desired value. The ForEachAP is the main navigator, // so it must select the XPath if no specific ForEachXPath // is given. pilotForEach.selectXPath(xPath); pilot.selectXPath(xPath); if (fieldForEach != null) { pilotForEach = new AutoPilot(vn); pilotForEach.selectXPath(fieldForEach); } else { // If there is no ForEachXPath indeed, the ForEachAP // always selects the path. The XPathAP is nearly // unemployed now: It just needs to point to the spot // the ForEachXPath navigated to. 
pilot.selectXPath("."); } options.returnXMLFragment = Boolean.parseBoolean(field.get(JulieXMLConstants.RETURN_XML_FRAGMENT)); options.returnArray = Boolean.parseBoolean(field.get(JulieXMLConstants.RETURN_ARRAY)); // options.returnAttributeValue = xPath // .matches(REGEX_XPATH_ATTR); options.resolveEntities = Boolean.parseBoolean(field.get(JulieXMLConstants.RESOLVE_ENTITIES)); options.concatString = field.get(JulieXMLConstants.CONCAT_STRING); if (options.concatString == null) options.concatString = ","; options.performGzip = Boolean.parseBoolean(field.get(JulieXMLConstants.GZIP)); navigators.put(fieldName, new XPathNavigator(vn, pilotForEach, pilot, options)); } else if (Boolean.parseBoolean(field.get(JulieXMLConstants.EXTRACT_FROM_FILENAME))) { String[] path = identifier.split("/"); navigators.put(fieldName, new FileNameValueSource(path[path.length - 1], field)); } else if (Boolean.parseBoolean(field.get(JulieXMLConstants.TIMESTAMP))) { navigators.put(fieldName, new TimestampValueSource()); } else if (field.get(JulieXMLConstants.CONSTANT_VALUE) != null) { String value = field.get(JulieXMLConstants.CONSTANT_VALUE); navigators.put(fieldName, new ConstantFieldValueSource(value)); } else { LOG.warn("Field with name \"" + fieldName + "\" does not define a source to get a value from (e.g. 
XML XPath or file name) and will not have imported any values."); } } return new Iterator>() { int index = startIndex; public boolean hasNext() { return index != -1; } public Map next() { if (!hasNext()) return null; Map row = new HashMap(); try { for (String fieldName : navigators.keySet()) { FieldValueSource navi = navigators.get(fieldName); vn.push(); Object fieldValue = navi.getFieldValue(); vn.pop(); row.put(fieldName, fieldValue); } row.put(JulieXMLConstants.VTD_INDEX, index); index = ap.evalXPath(); return row; } catch (XPathEvalException e) { e.printStackTrace(); } catch (NavException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } return null; } public void remove() { } }; } catch (XPathEvalException e) { e.printStackTrace(); } catch (NavException e) { e.printStackTrace(); } catch (XPathParseException e) { e.printStackTrace(); } catch (VTDException e) { e.printStackTrace(); } return null; } /** * Declares the given namespaces to the passed auto pilot. The namespaceMap * can automatically be derived from an XML document by calling {@link #buildNamespaceMap(VTDNav)}. * * @param ap * @param namespaceMap */ public static void declareNamespaces(AutoPilot ap, Map namespaceMap) { for (Map.Entry entry : namespaceMap.entrySet()) ap.declareXPathNameSpace(entry.getKey(), entry.getValue()); } /** * Reads the namespace axis of the XML document associated with vn and returns * a map connecting the namespace prefixes with their URI. This map can be passed to * {@link #declareNamespaces(AutoPilot, Map)} to declare all the namespaces of the document * to an {@link AutoPilot}. 
* * @param vn * @return * @throws VTDException */ public static Map buildNamespaceMap(VTDNav vn) throws VTDException { Map namespaceMap = new HashMap<>(); AutoPilot ap = new AutoPilot(vn); ap.selectXPath("//namespace::*"); String nsDeclaration = null; try { int i; while ((i = ap.evalXPath()) != -1) { nsDeclaration = vn.toString(i); if (nsDeclaration.contains(":")) { String nsPrefix = nsDeclaration.split(":")[1]; String nsUrl = vn.toString(i + 1); namespaceMap.put(nsPrefix, nsUrl); } } } catch (ArrayIndexOutOfBoundsException e) { LOG.error( "This algorithm expects XML namespace declarations to be of the form \"xmlns:\". The declaration actually was: \"" + nsDeclaration + "\"", e); } return namespaceMap; } public static VTDNav getVTDNav(InputStream is, int bufferSize) throws ParseException, FileTooBigException { VTDGen vg = null; try { byte[] data = readStream(is, bufferSize); vg = new VTDGen(); vg.setDoc(data); vg.parse(true); } catch (EncodingException e) { e.printStackTrace(); } catch (EOFException e) { e.printStackTrace(); } catch (EntityException e) { e.printStackTrace(); } catch (FileTooBigException e) { throw e; } catch (IOException e) { e.printStackTrace(); } catch (ParseException e) { String message = e.getMessage(); if (message.contains("file size too big")) throw new FileTooBigException(message); } return vg.getNav(); } /** * Reads an InputStream buffer wise, concatenates all buffers * and returns one byte[] of exact length of the read data. * * @param is InputStream to read. * @param bufferSize Size of maximum bytes to read by one is.read() * call. * @return A byte[] containing all the data of the * InputStream. 
* @throws IOException */ public static byte[] readStream(InputStream is, int bufferSize) throws IOException { byte[] buffer = new byte[bufferSize]; List bufferList = new ArrayList(); List readBytesList = new ArrayList(); int bytesRead = 0; int allBytesRead = 0; while ((bytesRead = is.read(buffer)) != -1) { bufferList.add(buffer); readBytesList.add(bytesRead); buffer = new byte[bufferSize]; // Overflow-check if (allBytesRead + bytesRead < allBytesRead) { LOG.info("Array size overflow while reading file. The file you are attempting to read " + "is propably greater than 2GB in size. Such files cannot be read using the default VTD XML parser. " + "Consider splitting the file into subfiles of size less than 2GB for using the default parser."); throw new FileTooBigException("Input file could not be read because it is too big (>2GB)"); } allBytesRead += bytesRead; } byte[] streamContent = new byte[allBytesRead]; int pos = 0; try { for (int i = 0; i < bufferList.size(); ++i) { System.arraycopy(bufferList.get(i), 0, streamContent, pos, readBytesList.get(i)); pos += readBytesList.get(i); } } catch (ArrayIndexOutOfBoundsException oob) { LOG.error( "Array index out of bounds - please check whether the file you try to read is less then 2GB in size.", oob); } finally { is.close(); } return streamContent; } public static byte[] gzipData(byte[] data) { ByteArrayOutputStream baos = new ByteArrayOutputStream(); try { GZIPOutputStream os = new GZIPOutputStream(baos); os.write(data); os.close(); return baos.toByteArray(); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return null; } public static byte[] unGzipData(byte[] gzipData) throws IOException { ByteArrayInputStream bais = new ByteArrayInputStream(gzipData); GZIPInputStream gzipInputStream = new GZIPInputStream(bais); byte[] data = JulieXMLTools.readStream(gzipInputStream, 1024); return data; } public static URL getSolrServerURL(String urlStr, boolean calledByCLI, 
Logger LOG) { URL serverURL; try { serverURL = new URL(urlStr); return serverURL; } catch (MalformedURLException e) { String msg = "Solr server URL '" + urlStr + "' is malformed: "; if (calledByCLI) LOG.error(msg + e.getMessage()); else LOG.error(msg, e); } return null; } public static String getElementText(VTDNav vn) throws NavException { StringBuilder sb = new StringBuilder(); int depth = vn.getCurrentDepth(); int i = vn.getCurrentIndex(); while (vn.getTokenType(i) == VTDNav.TOKEN_STARTING_TAG) i++; while (vn.getTokenDepth(i) >= depth && !(vn.getTokenType(i) == VTDNav.TOKEN_STARTING_TAG && vn.getTokenDepth(i) == depth) && i < vn.getTokenCount()) { if (vn.getTokenType(i) == VTDNav.TOKEN_CHARACTER_DATA || vn.getTokenType(i) == VTDNav.TOKEN_CDATA_VAL) sb.append(vn.toString(i)); i++; } return sb.toString(); } /** * Returns the fragment of XML, where vn currently points to, as a * string. * * @param vn The XML navigator. * @param fragmentType Either {@link #ELEMENT_FRAGMENT} or {@link #CONTENT_FRAGMENT}. * Determines which respective method on vn is called. * The first returns the whole element, including starting and * end tag, the latter omits the tags of the element and only * returns its enclosed contents. * @param returnRawString Whether to return a raw string, i.e. the pure XML fragment * without resolving XML entities, or a "readable" string which * then possibly cannot be used for further XML parsing. * @return The XML fragment of the current element vn points to. * @throws NavException */ public static String getFragment(VTDNav vn, int fragmentType, boolean returnRawString) throws NavException { long fragment = fragmentType == ELEMENT_FRAGMENT ? vn.getElementFragment() : vn.getContentFragment(); int offset = (int) fragment; int length = (int) (fragment >> 32); return returnRawString ? vn.toRawString(offset, length) : vn.toString(offset, length); } public static Map createField(String... 
configuration) { if (configuration.length % 2 == 1) throw new IllegalArgumentException("An even number of arguments is required. The even indexes " + "are field property keys, the odd indexes are the values to the previous key."); Map field = new HashMap<>(); for (int i = 0; i < configuration.length; i = i + 2) { String s = configuration[i]; field.put(s, configuration[i + 1]); } return field; } /** * Sets the text content of an XML element pointed to by xpath * to text. *

* The cursor of vn is moved to the element determined by * xpath. *

* * @param vn VTDNav object navigating the XML document to * modify. * @param ap AutoPilot object bound to vn. * @param xm XMLModifier object bound to vn. * @param xpath An XPath expression pointing to the XML element whose text * should be set. * @param text The text which is to be set to the XML element pointed to by * xpath. * @return The VTD index of the changed element, -1 otherwise. * @throws VTDException If something with navigation or modification of the XML * document goes wrong. * @throws UnsupportedEncodingException */ public static int setElementText(VTDNav vn, AutoPilot ap, XMLModifier xm, String xpath, String text) throws VTDException, UnsupportedEncodingException { ap.selectXPath(xpath); int elementIndex = ap.evalXPath(); LOG.trace("Setting element text to an XML element: Found element XPath {} at VTD token index {} (-1 means not found)", xpath, elementIndex); int textIndex = -1; // Go to the element to change, if existing. if (elementIndex != -1) { textIndex = vn.getText(); // If the element already has text, change it. if (textIndex != -1) { xm.updateToken(textIndex, text); LOG.trace("Element text already existed at token index {} and is replaced.", textIndex); } else { LOG.trace("Element is empty, setting new text."); // If the element is empty, insert the new text. 
xm.insertAfterHead(text); textIndex = elementIndex + 1; } } LOG.trace("Returning the VTD XML index of the new element text as {}", textIndex); return textIndex; } public static String[] expandArrayEntries(T[] array, String fmtStr) { String[] expandedEntries = new String[array.length]; for (int i = 0; i < expandedEntries.length; i++) { expandedEntries[i] = String.format(fmtStr, array[i]); } return expandedEntries; } public static String[] expandArrayEntries(List list, String fmtStr) { String[] array = new String[list.size()]; list.toArray(array); return expandArrayEntries(array, fmtStr); } public static String[] expandArrayEntries(T[] array, String[] fmtStrs) { if (array.length != fmtStrs.length) throw new IllegalArgumentException( "The size of the array with elements to be expanded must match the size of the array holding the extention format strings."); String[] expandedEntries = new String[array.length]; for (int i = 0; i < expandedEntries.length; i++) { expandedEntries[i] = String.format(fmtStrs[i], array[i]); } return expandedEntries; } public static String getXpathValue(String xpath, AutoPilot ap) throws XPathParseException { ap.selectXPath(xpath); return ap.evalXPathToString(); } public static String getXpathValue(String xpath, VTDNav vn) throws XPathParseException { return getXpathValue(xpath, new AutoPilot(vn)); } public static String getXpathValue(String xpath, InputStream is) throws IOException, XPathParseException, ParseException { VTDGen vg = new VTDGen(); vg.setDoc(readStream(is, 1024)); vg.parse(false); return getXpathValue(xpath, vg.getNav()); } } abstract class AbstractFieldValueSource implements FieldValueSource { protected byte[] gzipContent(Object content) { try { // TODO extend for array if (content instanceof String) return JulieXMLTools.gzipData(((String) content).getBytes("UTF-8")); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } return null; } } class ConstantFieldValueSource implements FieldValueSource { protected Object 
value; protected ConstantFieldValueSource() { } public ConstantFieldValueSource(String value) { this.value = value; } public Object getFieldValue() { return value; } } class FileNameValueSource extends ConstantFieldValueSource { public FileNameValueSource(String fileName, Map field) { String regex = field.get(JulieXMLConstants.REGEX); String replaceWith = field.get(JulieXMLConstants.REPLACE_WITH); value = fileName.replaceAll(regex, replaceWith); } } class TimestampValueSource extends ConstantFieldValueSource { public TimestampValueSource() { value = new Timestamp(System.currentTimeMillis()); } } class Options { public boolean returnXMLFragment; public boolean returnArray; public String concatString; public boolean resolveEntities; public boolean performGzip; } /** * This helper class bundles all classes needed to navigate a particular XPath * expression in an XML document. Additionally, it maintains a pointer 'index' * to the next occurence's position of the XPath expression in the document. * This is needed in case of elements referenced by an XPath which don't occur * in all subtrees defined by the 'forEach' attribute of this EntityProcessor. * Thus, before returning the field value by 'getFieldValue', it must be checked * if the 'forEach' loop already reached the correct position in the document. * This is indicated by the 'forEachIndex' parameter which in fact is always one * 'forEach' element ahead. 
* * @author faessler */ class XPathNavigator extends AbstractFieldValueSource { private VTDNav vn; private AutoPilot apFE; // AutoPilot "ForEach" private AutoPilot apXP; // AutoPilot "XPath" private Options options; private int vtdIndexOfLastValue = -1; public XPathNavigator(VTDNav nv, AutoPilot apForEach, AutoPilot apXPath, Options options) { this.vn = nv; this.apFE = apForEach; this.apXP = apXPath; this.options = options; } public Object getFieldValue() throws FieldValueRetrievalException { List retList = new ArrayList(); try { while (apFE.evalXPath() != -1) { if (options.returnXMLFragment) { long fragment = vn.getElementFragment(); int offset = (int) fragment; int length = (int) (fragment >> 32); retList.add(options.resolveEntities ? vn.toString(offset, length) : vn.toRawString(offset, length)); } else { retList.add(apXP.evalXPathToString()); } apXP.resetXPath(); } } catch (XPathEvalException | NavException e) { throw new FieldValueRetrievalException(e); } apFE.resetXPath(); Object retobj; if (retList.size() > 0) { if (options.returnArray) retobj = retList.toArray(new String[retList.size()]); else if (retList.size() > 1) retobj = StringUtils.join(retList, options.concatString); else retobj = retList.get(0); if (options.performGzip) return gzipContent(retobj); return retobj; } return null; } } /** * Essentially the same as the XPathNavigator except this version uses the * "Huge" classes from VTD XML intended to process very large XML files. 
*
 * @author faessler
 */
class XPathNavigatorHuge extends AbstractFieldValueSource {
    private final VTDNavHuge vn;
    private AutoPilotHuge apFE; // AutoPilot "ForEach"
    private AutoPilotHuge apXP; // AutoPilot "XPath"
    private final Options options;

    /**
     * @param nv        The navigator over the document.
     * @param apForEach The auto pilot iterating over the elements to extract from.
     *                  If {@code null}, {@code apXPath} takes this role and the
     *                  value is read at the position it navigates to.
     * @param apXPath   The auto pilot evaluating the value at each element.
     * @param options   The extraction options of the field.
     */
    // The throws clause is kept unchanged because callers catch these exceptions.
    public XPathNavigatorHuge(VTDNavHuge nv, AutoPilotHuge apForEach, AutoPilotHuge apXPath, Options options)
            throws XPathEvalException, NavException, XPathEvalExceptionHuge, NavExceptionHuge,
            XPathParseExceptionHuge {
        this.vn = nv;
        this.apFE = apForEach;
        this.apXP = apXPath;
        if (apFE == null) {
            this.apFE = this.apXP;
            this.apXP = new AutoPilotHuge(vn);
            apXP.selectXPath(".");
        }
        this.options = options;
    }

    /**
     * Collects one value for every element the "ForEach" auto pilot finds from
     * the navigator's current position.
     *
     * @return {@code null} if nothing was found. Otherwise a {@code String[]} if
     *         arrays are requested, the single value, or all values joined with
     *         the configured delimiter. If GZIP is requested, the result of
     *         {@code gzipContent} for that value is returned instead.
     * @throws FieldValueRetrievalException If navigation, XPath evaluation or
     *                                      reading a fragment fails.
     */
    @Override
    public Object getFieldValue() throws FieldValueRetrievalException {
        List<String> values = new ArrayList<>();
        try {
            while (apFE.evalXPath() != -1) {
                if (options.returnXMLFragment) {
                    long[] fragment = vn.getElementFragment();
                    long offset = fragment[0];
                    long length = fragment[1];
                    // Assumption: if the user wants the whole XML fragment, it
                    // is likely he wants it to be valid XML, so entities are
                    // not resolved.
                    // Getting the XML fragment in the VTD-XML-Huge version is a
                    // bit messy; the bytes have to be read from the data
                    // storage object itself.
                    Object xml = vn.getXML();
                    if (xml instanceof JulieXMLBuffer) {
                        byte[] fragmentBytes = ((JulieXMLBuffer) xml).getFragment(offset, length);
                        // NOTE(review): assumes the document is UTF-8 encoded, as
                        // the rest of this file does - confirm for other encodings.
                        values.add(new String(fragmentBytes, StandardCharsets.UTF_8));
                    } else {
                        JulieXMLTools.LOG.error("Casting from com.ximpleware.extended.IByteBuffer to "
                                + JulieXMLBuffer.class.getName() + " failed. You must pass an Instance of"
                                + JulieXMLBuffer.class.getName()
                                + " to the VTDGenHuge object which contains the XML data to be parsed.");
                    }
                } else {
                    values.add(apXP.evalXPathToString());
                }
                apXP.resetXPath();
            }
        } catch (XPathEvalExceptionHuge | NavExceptionHuge | IOException e) {
            throw new FieldValueRetrievalException(e);
        } finally {
            // Reset even on failure so the next row starts with a fresh evaluation.
            apFE.resetXPath();
        }
        if (values.isEmpty())
            return null;
        Object value;
        if (options.returnArray)
            value = values.toArray(new String[0]);
        else if (values.size() > 1)
            value = StringUtils.join(values, options.concatString);
        else
            value = values.get(0);
        return options.performGzip ? gzipContent(value) : value;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy