com.aliasi.chunk.RegExChunker Maven / Gradle / Ivy
Show all versions of aliasi-lingpipe Show documentation
/*
* LingPipe v. 4.1.0
* Copyright (C) 2003-2011 Alias-i
*
* This program is licensed under the Alias-i Royalty Free License
* Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i
* Royalty Free License Version 1 for more details.
*
* You should have received a copy of the Alias-i Royalty Free License
* Version 1 along with this program; if not, visit
* http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
* Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
* +1 (718) 290-9170.
*/
package com.aliasi.chunk;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.aliasi.util.AbstractExternalizable;
import com.aliasi.util.Compilable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.Serializable;
/**
* A RegExChunker
finds chunks that matches regular
* expressions. Specifically, a matcher is created and its {@link
* Matcher#find()} method is used to iterate over matching text
* segments and convert them to chunks.
*
* The behavior of the find method is largely determined by the
* specific instance of {@link Pattern}) on which the chunker is
* based. For more information, see Sun's RegEx
* Tutorial.
*
*
All found chunks will receive a type and score that is specified
* at construction time.
*
*
Warning: Java uses the same regular expression matching
* as Perl. Perl uses a greedy
* strategy for quantifiers, taking something like .*
to
* match as many characters as possible. In constrast, disjunction
* uses a first-match strategy. For example, the regular expression
* ab|abc
will not produce the same chunker as
* abc|ab
; for input abcde
, the former will
* return ab
as a chunk, whereas the latter will return
* abc
. This first-best matching through disjunctions
* takes precedence over any quantifiers applied to the strings.
*
*
Compilation and Serialization
*
* For convenience, this class implements both the util.Compilable
* and java.io.Serializable
interfaces. These both store the
* same thing, namely the string underlying the regex pattern, the chunk type
* and the score. The reconstituted object will also be an instance of this
* class.
*
* @author Bob Carpenter
* @version 3.8
* @since LingPipe2.3
*/
public class RegExChunker implements Chunker, Compilable, Serializable {
static final long serialVersionUID = -8997320544817071938L;
private final Pattern mPattern;
private final String mChunkType;
private final double mChunkScore;
/**
* Construct a chunker based on the specified regular expression,
* producing the specified chunk type and score. The regular
* expression is compiled using the default method {@link
* Pattern#compile(String)}.
*
* @param regex Regular expression for chunks.
* @param chunkType Type for all found chunks.
* @param chunkScore Score for all found chunks.
*/
public RegExChunker(String regex, String chunkType, double chunkScore) {
this(Pattern.compile(regex),chunkType,chunkScore);
}
/**
* Construct a chunker based on the specified regular expression
* pattern, producing the specified chunk type and score.
*
* @param pattern Regular expression patternfor chunks.
* @param chunkType Type for all found chunks.
* @param chunkScore Score for all found chunks.
*/
public RegExChunker(Pattern pattern, String chunkType, double chunkScore) {
mPattern = pattern;
mChunkType = chunkType;
mChunkScore = chunkScore;
}
/**
* Return the chunking of the specified character sequence. Chunkings
* are defined by the behavior of {@link Matcher#find()} as applied
* to the regular expression pattern underlying this chunker.
*
* @param cSeq Character sequence to chunk.
* @return A chunking of the character sequence.
*/
public Chunking chunk(CharSequence cSeq) {
ChunkingImpl result = new ChunkingImpl(cSeq);
Matcher matcher = mPattern.matcher(cSeq);
while (matcher.find()) {
int start = matcher.start();
int end = matcher.end();
Chunk chunk
= ChunkFactory.createChunk(start,end,mChunkType,mChunkScore);
result.add(chunk);
}
return result;
}
/**
* Compiles this regular-expression chunker to the specified
* object output. When read back in, the object will be an
* instance of this class.
*
* @param out Object output to which this chunker is compiled.
* @throws IOException If there is an underlying I/O error during
* the write.
*/
public void compileTo(ObjectOutput out) throws IOException {
out.writeObject(new Externalizer(this));
}
private Object writeReplace() {
return new Externalizer(this);
}
/**
* Return the chunking of the specified character slice.
*
* @param cs Underlying character sequence.
* @param start Index of first character in slice.
* @param end Index of one past the last character in the slice.
* @return The chunking over the specified character slice.
*/
public Chunking chunk(char[] cs, int start, int end) {
return chunk(new String(cs,start,end-start));
}
static class Externalizer extends AbstractExternalizable {
static final long serialVersionUID = -3419191413174871277L;
private final RegExChunker mChunker;
public Externalizer() {
this(null);
}
public Externalizer(RegExChunker chunker) {
mChunker = chunker;
}
@Override
public void writeExternal(ObjectOutput out) throws IOException {
out.writeUTF(mChunker.mPattern.pattern());
out.writeUTF(mChunker.mChunkType);
out.writeDouble(mChunker.mChunkScore);
}
@Override
public Object read(ObjectInput in) throws IOException, ClassNotFoundException {
String pattern = in.readUTF();
String chunkType = in.readUTF();
double score = in.readDouble();
return new RegExChunker(pattern,chunkType,score);
}
}
}