// All downloads are free. Search and download functionality uses the official Maven repository.

// de.unkrig.commons.text.scanner.ScannerUtil (Maven / Gradle / Ivy)


/*
 * de.unkrig.commons - A general-purpose Java class library
 *
 * Copyright (c) 2011, Arno Unkrig
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
 * following conditions are met:
 *
 *    1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
 *       following disclaimer.
 *    2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
 *       following disclaimer in the documentation and/or other materials provided with the distribution.
 *    3. The name of the author may not be used to endorse or promote products derived from this software without
 *       specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
 * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

package de.unkrig.commons.text.scanner;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;

import de.unkrig.commons.io.LineUtil;
import de.unkrig.commons.lang.protocol.Predicate;
import de.unkrig.commons.lang.protocol.Producer;
import de.unkrig.commons.lang.protocol.ProducerWhichThrows;
import de.unkrig.commons.nullanalysis.Nullable;
import de.unkrig.commons.text.scanner.AbstractScanner.Token;

/**
 * Various scanner-related utility methods.
 */
public final
class ScannerUtil {

    private ScannerUtil() {}

    /**
     * Converts the {@link StringScanner}, which reads from a {@link CharSequence} into a {@link DocumentScanner},
     * which reads from a {@link Reader} and thinks in "line numbers" and "column numbers").
     *
     * 

{@link IOException}s thrown by the {@link Reader} are wrapped in {@link ScanException}s. */ public static > DocumentScanner toDocumentScanner(final StringScanner stringScanner, final Reader reader) { final ProducerWhichThrows lineProducer = LineUtil.readLineWithSeparator(reader); return new DocumentScanner() { private int lineNumber; @Override @Nullable public Token produce() throws ScanException { for (;;) { Token token = stringScanner.produce(); if (token != null) return token; String line; try { line = lineProducer.produce(); } catch (IOException ioe) { throw new ScanException(ioe); } if (line == null) return null; this.lineNumber++; stringScanner.setInput(line); } } @Override public int getPreviousTokenLineNumber() { return this.lineNumber; } @Override public int getPreviousTokenColumnNumber() { return stringScanner.getPreviousTokenOffset() + 1; } @Override public String toString() { return "Line " + this.getPreviousTokenLineNumber() + ", column " + this.getPreviousTokenColumnNumber(); } }; } /** * Uses a given {@link StringScanner} to scan the contents of a given file. * * @param charset E.g. 
{@link Charset#forName(String)} or {@link Charset#defaultCharset()} * @see #toDocumentScanner(StringScanner, Reader) */ public static > ProducerWhichThrows, ScanException> scanner(final StringScanner stringScanner, File file, Charset charset) throws FileNotFoundException { return ScannerUtil.augmentScanningLocation(ScannerUtil.toDocumentScanner( stringScanner, new BufferedReader(new InputStreamReader(new FileInputStream(file), charset)) ), file.toString()); } /** * @return A {@link Producer} who's {@link #toString()} method prepends the given {@code prefix}, a colon and a * space to the string returned by the {@code delegate}'s {@link #toString()} method */ public static ProducerWhichThrows augmentScanningLocation( final ProducerWhichThrows delegate, @Nullable final String prefix ) { return new ProducerWhichThrows() { @Override @Nullable public T produce() throws EX { return delegate.produce(); } @Override @Nullable public String toString() { return prefix == null ? delegate.toString() : prefix + ": " + delegate.toString(); } }; } /** * @return A {@link StringScanner} which produces tokens through a {@code delegate}, but only those for which * the {@code predicate} returns {@code true} */ public static > StringScanner filter(final StringScanner delegate, final Predicate> predicate) { return new StringScanner() { @Override @Nullable public Token produce() throws ScanException { for (;;) { Token token = delegate.produce(); if (token == null) return null; if (predicate.evaluate(token)) return token; } } @Override public StringScanner setInput(CharSequence cs) { delegate.setInput(cs); return this; } @Override public int getOffset() { return delegate.getOffset(); } @Override public int getPreviousTokenOffset() { return delegate.getPreviousTokenOffset(); } @Override @Nullable public String toString() { return delegate.toString(); } }; } /** * If set, {@code "\023"} escapes to {@code '\0', '2', '3'}, otherwise, if {@value #UNESCAPE_OCTAL}, it escapes to * {@code '#'}, 
otherwise, it is an invalid escape sequence. * * @see #unescape(String, int) */ public static final int UNESCAPE_NUL = 1; /** * If set, {@code \"} escapes to a double quote, otherwise it is an invalid escape sequence. * * @see #unescape(String, int) */ public static final int UNESCAPE_DOUBLE_QUOTE = 2; /** * If set, {@code \'} escapes to a single quote, otherwise it is an invalid escape sequence. * * @see #unescape(String, int) */ public static final int UNESCAPE_SINGLE_QUOTE = 4; /** * If set, "\uxxxx" escapes to the unicode character 'xxxx', otherwise it is an * invalid escape sequence. * * @see #unescape(String, int) */ public static final int UNESCAPE_UNICODE = 8; /** * If set, {@code "\123"} escapes to "S", otherwise it is an invalid escape sequence (but see {@link * #UNESCAPE_NUL}). * * @see #unescape(String, int) */ public static final int UNESCAPE_OCTAL = 16; /** * @return The input string, unescaped according to the rules defined in JLS7 * @throws ScanException {@code s} contains control characters * @throws ScanException \\ is not followed by exactly four hex digits * @throws ScanException A backslash is not followed by an allowed character * @throws ScanException {@code s} ends in the middle of an escape sequence * @see #UNESCAPE_NUL * @see #UNESCAPE_DOUBLE_QUOTE * @see #UNESCAPE_SINGLE_QUOTE * @see #UNESCAPE_UNICODE * @see #UNESCAPE_OCTAL */ public static String unescape(String s, int options) throws ScanException { if (s.indexOf('\\') == -1) return s; StringBuilder sb = new StringBuilder(s.length() + 3); for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); if (Character.isISOControl(c)) throw new ScanException("Character " + (int) c + " not allowed in string"); if (c != '\\') { sb.append(c); continue; } try { c = s.charAt(++i); int idx; if (c == '0' && (options & ScannerUtil.UNESCAPE_NUL) != 0) { sb.append('\0'); } else if (c == '"' && (options & ScannerUtil.UNESCAPE_DOUBLE_QUOTE) != 0) { sb.append('"'); } else if (c == '\'' && (options & 
ScannerUtil.UNESCAPE_SINGLE_QUOTE) != 0) { sb.append('\''); } else if ((idx = "\\/bfnrt".indexOf(c)) != -1) { sb.append("\\/\b\f\n\r\t".charAt(idx)); } else if (c == 'u' && (options & ScannerUtil.UNESCAPE_UNICODE) != 0) { int h = 0; for (int j = 0; j < 4; j++) { c = s.charAt(++i); int nibble = Character.digit(c, 16); if (nibble == -1) throw new ScanException("'" + c + "' is not a hex digit"); h = (h << 4) + nibble; } sb.append((char) h); } else if (c >= '0' && c <= '7' && (options & ScannerUtil.UNESCAPE_OCTAL) != 0) { int o = Character.digit(c, 8); if (i < s.length() - 1) { int octel = Character.digit(s.charAt(i + 1), 8); if (octel != -1) { o = (o << 3) + octel; i++; if (o <= 31 && i < s.length() - 1) { octel = Character.digit(s.charAt(i + 1), 8); if (octel != -1) { o = (o << 3) + octel; i++; } } } } sb.append((char) o); } else { throw new ScanException("Invalid character '" + c + "' after backslash"); } } catch (IndexOutOfBoundsException ioobe) { throw new ScanException("Truncated escape sequence"); // SUPPRESS CHECKSTYLE AvoidHidingCause } } return sb.toString(); } }