All downloads are free. The search and download functionality uses the official Maven repository.

nu.validator.xml.CharacterUtil Maven / Gradle / Ivy

Go to download

An HTML-checking library (used by https://html5.validator.nu and the HTML5 facet of the W3C Validator)

There is a newer version: 20.7.2
Show newest version
/*
 * Copyright (c) 2005 Marko Karppinen & Co. LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a 
 * copy of this software and associated documentation files (the "Software"), 
 * to deal in the Software without restriction, including without limitation 
 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
 * and/or sell copies of the Software, and to permit persons to whom the 
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in 
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
 * DEALINGS IN THE SOFTWARE.
 */

package nu.validator.xml;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers that strip characters which should not appear in XML
 * character data.
 *
 * <p>{@link java.util.regex.Pattern} matches Unicode <em>code points</em>,
 * not UTF-16 code units. Supplementary characters therefore have to be
 * whitelisted with explicit code-point ranges ({@code \x{10000}-\x{10FFFF}});
 * listing the surrogate code-unit ranges does not cover them.
 *
 * @version $Id$
 * @author hsivonen
 */
public class CharacterUtil {

    /**
     * Matches every code point that is neither TAB, LF, CR, in
     * U+0020..U+FFFD, nor a supplementary character (U+10000..U+10FFFF).
     * This follows the XML 1.0 {@code Char} production, except that unpaired
     * surrogate code units (which fall inside U+0020..U+FFFD) are tolerated
     * and left in place.
     */
    private static final String NOT_XML_CHAR =
            "[^\\x09\\x0A\\x0D\\x20-\\uFFFD\\x{10000}-\\x{10FFFF}]";

    private final static Pattern MINIMAL = Pattern.compile(NOT_XML_CHAR);

    /**
     * Everything {@link #MINIMAL} matches, plus U+FEFF, the DEL/C1 control
     * range U+007F..U+009F, the noncharacter block U+FDD0..U+FDEF and the
     * last two code points (U+xFFFE, U+xFFFF) of each supplementary plane.
     * The BMP pair U+FFFE/U+FFFF is already covered by {@link #NOT_XML_CHAR}.
     */
    private final static Pattern PRUDENT = Pattern.compile(
            NOT_XML_CHAR
            + "|\\uFEFF|[\\x7F-\\x9F]|[\\uFDD0-\\uFDEF]|["
            + supplementaryNoncharacters() + "]");

    /**
     * Builds the body of a regex character class listing U+1FFFE, U+1FFFF,
     * U+2FFFE, ... U+10FFFF as {@code \x{...}} escapes.
     */
    private static String supplementaryNoncharacters() {
        StringBuilder sb = new StringBuilder();
        for (int plane = 1; plane <= 0x10; plane++) {
            int base = plane << 16;
            sb.append("\\x{").append(Integer.toHexString(base | 0xFFFE)).append('}');
            sb.append("\\x{").append(Integer.toHexString(base | 0xFFFF)).append('}');
        }
        return sb.toString();
    }

    /**
     * Removes the characters matched by {@link #MINIMAL} from {@code data}:
     * C0 controls other than TAB, LF and CR, and U+FFFE/U+FFFF. All
     * supplementary characters are preserved.
     *
     * @param data the text to clean; must not be {@code null}
     * @return a new string with the offending characters deleted
     */
    public static String scrubCharacterData(CharSequence data) {
        Matcher m = MINIMAL.matcher(data);
        return m.replaceAll("");
    }

    /**
     * Removes everything {@link #scrubCharacterData(CharSequence)} removes
     * and additionally U+FEFF, U+007F..U+009F and all Unicode noncharacters
     * (U+FDD0..U+FDEF and the U+xFFFE/U+xFFFF pair of every plane).
     *
     * @param data the text to clean; must not be {@code null}
     * @return a new string with the offending characters deleted
     */
    public static String prudentlyScrubCharacterData(CharSequence data) {
        Matcher m = PRUDENT.matcher(data);
        return m.replaceAll("");
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy