org.apache.tika.utils.RegexUtils Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.utils;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Inspired from Nutch code class OutlinkExtractor. Apply regex to extract
 * content
 * 
 * 
 */
public class RegexUtils {

    /**
     * Regex pattern to get URLs within a plain text.
     * 
     * @see http://www.truerwords.net/articles/ut/urlactivation.html
     *      
     */
    private static final String LINKS_REGEX =
        "([A-Za-z][A-Za-z0-9+.-]{1,120}:"
        + "[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}"
        + "(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
    
    private static final Pattern LINKS_PATTERN = Pattern.compile(LINKS_REGEX, Pattern.CASE_INSENSITIVE + Pattern.MULTILINE);

    /**
     * Extract urls from plain text.
     *
     * @param content The plain text content to examine
     * @return List of urls within found in the plain text
     */
    public static List extractLinks(String content) {
        if (content == null || content.length() == 0) {
            return Collections.emptyList();
        }

        List extractions = new ArrayList();
        final Matcher matcher = LINKS_PATTERN.matcher(content);
        while (matcher.find()) {
            extractions.add(matcher.group());
        }
        return extractions;

    }
}