All downloads are free. Search and download functionality uses the official Maven repository.

com.aliasi.corpus.Parser Maven / Gradle / Ivy

Go to download

This is the original LingPipe: http://alias-i.com/lingpipe/web/download.html. No changes were made to the source code.

There is a newer version: 4.1.2-JL1.0
Show newest version
/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.corpus;

import com.aliasi.util.Files;
import com.aliasi.util.Strings;

import org.xml.sax.InputSource;

import java.io.File;
import java.io.IOException;

/**
 * The Parser abstract class provides methods for parsing
 * content from an input source or character sequence and passing
 * extracted events to a content handler.  Concrete implementations will
 * typically make assumptions about the type of the handler.
 *
 * 

Concrete subclasses must implement both {@link * #parse(InputSource)} and {@link #parseString(char[],int,int)}. Two * subclasses of this class, {@link InputSourceParser} and {@link * StringParser} may be extended by implementing only one of the * above methods. * * @author Bob Carpenter * @version 4.0.1 * @since LingPipe2.1 * @param the type of handler which receives events from this parser */ public abstract class Parser { private H mHandler; /** * Construct a parser with a null handler. */ public Parser() { this(null); } /** * Construct a parser with the specified handler. * * @param handler Current handler. */ public Parser(H handler) { mHandler = handler; } /** * Sets the content handler to the specified value. The current * handler is used for all content extracted by this parser. * * @param handler Handler to use for content extracted from parsed * content. */ public void setHandler(H handler) { mHandler = handler; } /** * Returns the current content handler. The current handler is * applied to all extracted content. * * @return Current content handler. */ public H getHandler() { return mHandler; } /** * Parse the specified system identifier, usually a URL name, * passing extracted events to the handler. * *

The implementation provided by this abstract class * constructs an input source from the system identifier * and passes it to {@link #parse(InputSource)}. * *

Warning: This method does not specify a character * encoding. Behavior will depend on the implementation. * See {@link #parse(String,String)} to set the encoding. * * @param sysId System ID from which to read. * @throws IOException If there is an exception reading * from the specified source. */ public void parse(String sysId) throws IOException { InputSource in = new InputSource(sysId); parse(in); } /** * Parse the specified system identifier, usually a URL name, * using the specified character encoding, passing extracted * events to the handler. * *

Warning: The character encoding is set on the * constructed input source, but this may be ignored by the * parser. * * @param sysId System identifier for data, typically a URL name. * @param charEncoding Character encoding to use for the data. */ public void parse(String sysId, String charEncoding) throws IOException { InputSource in = new InputSource(sysId); in.setEncoding(charEncoding); parse(in); } /** * Parse the specified file, passing extracted events to the * handler. * *

The file is converted to a URL name using * {@code File.toURI().toURL().toString()}. It is then used * to construct an input source which is parsed using * the {@code #parse(InputSource)} method. * *

Warning: This method does not specify a character * encoding on the constructed input source. Behavior will depend * on the implementation. See {@link #parse(File,String)} to set * the character encoding. * * @param file File to parse. * @throws IOException If there is an exception reading * from the specified file or it does not exist. */ public void parse(File file) throws IOException { parse(file.toURI().toURL().toString()); } /** * Parse the specified file using the specified character * encoding, passing events to the handler. * *

The file is converted to a URL name using * {@code File.toURI().toURL().toString()}. * *

Warning: The character encoding will be set on the * input source for parsing, but parser implementations may * choose to ignore it. * * @param file File from which to read. * @param charEncoding Characte encoding for text. * @throws IOException If there is an exception reading * from the specified source. */ public void parse(File file, String charEncoding) throws IOException { String urlName = file.toURI().toURL().toString(); InputSource in = new InputSource(urlName); in.setEncoding(charEncoding); parse(in); } /** * Parse the specified character sequence. Extracted content * is passed to the current handler. * *

The character sequence is converted to a character array * using {@link Strings#toCharArray(CharSequence)} and then * passed as a slice to to {@link #parseString(char[],int,int)}. * * @param cSeq Character sequence to parse. * @throws IOException If there is an exception reading the * characters. */ public void parseString(CharSequence cSeq) throws IOException { char[] cs = Strings.toCharArray(cSeq); parseString(cs,0,cs.length); } /** * Parse the specified input source, passing extracted events to * the handler. Concrete subclasses must implement this method. * * @param in Input source from which to read. * @throws IOException If there is an exception reading from the * specified stream. */ abstract public void parse(InputSource in) throws IOException; /** * Parse the specified character slice as a string input. Extracted * content is passed to the current handler. * * @param cs Characters underlying slice. * @param start Index of first character in slice. * @param end One past the index of the last character in slice. * @throws IOException If there is an exception reading the * characters. */ abstract public void parseString(char[] cs, int start, int end) throws IOException; }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy