All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.parser.html.DataURISchemeUtil Maven / Gradle / Ivy

There is a newer version: 3.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.parser.html;

import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.codec.binary.Base64;

import org.apache.tika.mime.MediaType;

/**
 * Parses single {@code data:} URIs and extracts {@code data:} URIs embedded in
 * free text (for example, inside javascript or css).
 * <p>
 * Not thread safe.  Create a separate util for each thread.  Each instance
 * holds reusable {@link Matcher} objects that are reset on every call.
 */
public class DataURISchemeUtil {

    /**
     * Media type string for a data URI that does not declare one.
     * Not referenced inside this class.
     */
    public static String UNSPECIFIED_MEDIA_TYPE = "text/plain;charset=US-ASCII";

    //group 1: media type (possibly empty), group 2: "base64" marker (or null), group 3: data
    private static final Pattern PARSE_PATTERN =
            Pattern.compile("(?s)data:([^,]*?)(base64)?,(.*)$");
    //same groups as above, but the data runs up to (and requires) a closing quote character
    private static final Pattern EXTRACT_PATTERN =
            Pattern.compile("(?s)data:([^,]*?)(base64)?,([^\"\']*)[\"\']");
    private final Matcher parseMatcher = PARSE_PATTERN.matcher("");
    private final Matcher extractMatcher = EXTRACT_PATTERN.matcher("");
    Base64 base64 = new Base64();

    /**
     * Parses the first data URI found in the given string.
     *
     * @param string text containing a data URI; everything after the first comma
     *               that follows {@code data:} is treated as the payload
     * @return the parsed data URI
     * @throws DataURISchemeParseException if the string contains no data URI pattern
     */
    public DataURIScheme parse(String string) throws DataURISchemeParseException {
        parseMatcher.reset(string);
        if (parseMatcher.find()) {
            return build(parseMatcher.group(1), parseMatcher.group(2), parseMatcher.group(3));
        }
        throw new DataURISchemeParseException("Couldn't find expected pattern");
    }

    /**
     * Builds a {@link DataURIScheme} from the pieces captured by one of the patterns.
     *
     * @param mediaTypeString raw media type portion of the URI (may be empty)
     * @param isBase64        non-null if the "base64" marker was present
     * @param dataString      raw payload portion of the URI
     * @return the data URI with its payload converted to bytes
     */
    private DataURIScheme build(String mediaTypeString, String isBase64, String dataString) {
        //back slashes, as you might have in css, are replaced with spaces
        if (dataString != null) {
            dataString = dataString.replaceAll("\\\\", " ");
        }

        byte[] data;
        if (dataString == null || dataString.length() == 0) {
            data = new byte[0];
        } else if (isBase64 != null) {
            data = base64.decode(dataString);
        } else {
            //TODO: handle encodings
            data = dataString.getBytes(resolveCharset(mediaTypeString));
        }
        return new DataURIScheme(mediaTypeString, (isBase64 != null), data);
    }

    /**
     * Picks the charset used to turn a non-base64 payload into bytes.
     * Falls back to UTF-8 when the media type is missing or unparseable,
     * carries no charset parameter, or names an illegal or unsupported charset.
     *
     * @param mediaTypeString raw media type portion of the URI (may be empty)
     * @return the charset named by the media type, or UTF-8
     */
    private static Charset resolveCharset(String mediaTypeString) {
        MediaType mediaType = MediaType.parse(mediaTypeString);
        //parse() yields null when the media type is absent (e.g. "data:,hello") or malformed
        if (mediaType == null || !mediaType.hasParameters()) {
            return StandardCharsets.UTF_8;
        }
        String charsetName = mediaType.getParameters().get("charset");
        if (charsetName == null) {
            return StandardCharsets.UTF_8;
        }
        try {
            //isSupported() itself throws on an illegal name, so it must live inside the try
            if (Charset.isSupported(charsetName)) {
                return Charset.forName(charsetName);
            }
        } catch (IllegalCharsetNameException ignored) {
            //illegal charset name: default to UTF-8
        }
        return StandardCharsets.UTF_8;
    }

    /**
     * Extracts DataURISchemes from free text, as in javascript.
     * Only data URIs that are terminated by a single or double quote are found.
     *
     * @param string text to scan for embedded data URIs
     * @return list of extracted DataURISchemes; empty if none were found
     */
    public List<DataURIScheme> extract(String string) {
        extractMatcher.reset(string);
        //allocated lazily so that text without data URIs costs no list
        List<DataURIScheme> list = null;
        while (extractMatcher.find()) {
            DataURIScheme dataURIScheme = build(extractMatcher.group(1), extractMatcher.group(2),
                    extractMatcher.group(3));
            if (list == null) {
                list = new ArrayList<>();
            }
            list.add(dataURIScheme);
        }
        return (list == null) ? Collections.<DataURIScheme>emptyList() : list;
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy