All downloads are free. The search and download features use the official Maven repository.

com.topologi.diffx.load.text.TokenizerByText Maven / Gradle / Ivy

Go to download

docx4j is a library which helps you to work with the Office Open XML file format as used in docx documents, pptx presentations, and xlsx spreadsheets.

There is a newer version: 6.1.2
Show newest version
/*
 * This file is part of the DiffX library.
 *
 * For licensing information please see the file license.txt included in the release.
 * A copy of this licence can also be found at
 *   http://www.opensource.org/licenses/artistic-license-2.0.php
 */
package com.topologi.diffx.load.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import com.topologi.diffx.config.TextGranularity;
import com.topologi.diffx.config.WhiteSpaceProcessing;
import com.topologi.diffx.event.TextEvent;
import com.topologi.diffx.event.impl.CharactersEvent;
import com.topologi.diffx.event.impl.IgnorableSpaceEvent;
import com.topologi.diffx.event.impl.SpaceEvent;

/**
 * The tokeniser for characters events.
 * 
 * 

This class is not synchronized. * * @author Christophe Lauret * @version 11 May 2010 */ public final class TokenizerByText implements TextTokenizer { /** * Define the whitespace processing. */ private final WhiteSpaceProcessing whitespace; /** * Creates a new tokenizer. * * @param whitespace the whitespace processing for this tokenizer. * * @throws NullPointerException if the white space processing is not specified. */ public TokenizerByText(WhiteSpaceProcessing whitespace) { if (whitespace == null) throw new NullPointerException("the white space processing must be specified."); this.whitespace = whitespace; } /** * {@inheritDoc} */ public List tokenize(CharSequence seq) { if (seq == null) return null; if (seq.length() == 0) return Collections.emptyList(); int x = TokenizerUtils.getLeadingWhiteSpace(seq); int y = TokenizerUtils.getTrailingWhiteSpace(seq); // no leading or trailing spaces return a singleton in all configurations if (x == 0 && y == 0) { TextEvent e = new CharactersEvent(seq); return Collections.singletonList(e); } // The text node is only white space (white space = trailing space) if (x == seq.length()) { switch (this.whitespace) { case COMPARE: return Collections.singletonList((TextEvent)SpaceEvent.getInstance(seq.toString())); case PRESERVE: return Collections.singletonList((TextEvent)new IgnorableSpaceEvent(seq.toString())); case IGNORE: return Collections.emptyList(); default: } TextEvent e = new CharactersEvent(seq); return Collections.singletonList(e); } // some trailing or leading whitespace, behaviour changes depending on whitespace processing List events = null; switch (this.whitespace) { case COMPARE: events = new ArrayList(1 + (x > 0 ? 1 : 0) + (y > 0 ? 
1 : 0)); if (x > 0) { events.add(SpaceEvent.getInstance(seq.subSequence(0, x))); } events.add(new CharactersEvent(seq.subSequence(x, seq.length()-y))); if (y > 0) { events.add(SpaceEvent.getInstance(seq.subSequence(seq.length()-y, seq.length()))); } break; case PRESERVE: events = new ArrayList(1 + (x > 0 ? 1 : 0) + (y > 0 ? 1 : 0)); if (x > 0) { events.add(new IgnorableSpaceEvent(seq.subSequence(0, x))); } events.add(new CharactersEvent(seq.subSequence(x, seq.length()-y))); if (y > 0) { events.add(new IgnorableSpaceEvent(seq.subSequence(seq.length()-y, seq.length()))); } break; case IGNORE: TextEvent e = new CharactersEvent(seq.subSequence(x, seq.length()-y)); events = Collections.singletonList(e); break; default: } return events; } /** * Always TextGranularity.CHARACTER. * * {@inheritDoc} */ public TextGranularity granurality() { return TextGranularity.TEXT; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy