// org.apache.tika.sax.StandardsText Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.sax;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.tika.sax.StandardReference.StandardReferenceBuilder;
/**
 * StandardsText relies on regular expressions to extract standard references
 * from text.
 *
 * <p>
 * This class helps to find the standard references from text by performing the
 * following steps:
 * <ol>
 * <li>searches for headers;</li>
 * <li>searches for patterns that are supposed to be standard references
 * (basically, every string mostly composed of uppercase letters followed by
 * alphanumeric characters);</li>
 * <li>each potential standard reference starts with a score equal to 0.25;</li>
 * <li>increases by 0.25 the score of references which include the name of a
 * known standard organization ({@link StandardOrganizations});</li>
 * <li>increases by 0.25 the score of references which include the word
 * Publication or Standard;</li>
 * <li>increases by 0.25 the score of references which have been found within
 * "Applicable Documents" and equivalent sections;</li>
 * <li>returns the standard references along with scores.</li>
 * </ol>
 */
public class StandardsText {
    // Regular expression to match uppercase headers
    private static final String REGEX_HEADER = "(\\d+\\.(\\d+\\.?)*)\\p{Blank}+([A-Z]+(\\s[A-Z]+)*){5,}";

    // Regular expression to find the keywords that identify the "APPLICABLE
    // DOCUMENTS" and equivalent sections. It is meant to be searched for inside
    // a header (Matcher.find()), not matched against the whole header: headers
    // start with a section number, so a whole-string match on a bare keyword
    // such as REFERENCE could never succeed.
    private static final String REGEX_APPLICABLE_DOCUMENTS =
            "(?i:APPLICABLE\\sDOCUMENTS|REFERENCE|STANDARD|REQUIREMENT|GUIDELINE|COMPLIANCE)";

    // Regular expression to match the alphanumeric identifier of the standard
    private static final String REGEX_IDENTIFIER =
            "(?<identifier>([0-9]{3,}|([A-Z]+(-|_|\\.)?[0-9]{2,}))((-|_|\\.)?[A-Z0-9]+)*)";

    // Regular expression to match the standard organization
    private static final String REGEX_ORGANIZATION = StandardOrganizations.getOrganzationsRegex();

    // Regular expression to match the type of publication, often reported
    // between the name of the standard organization and the standard identifier
    private static final String REGEX_STANDARD_TYPE = "(\\s(?i:Publication|Standard))";

    // Regular expression to match a string that is supposed to be a standard
    // reference. The named groups (mainOrganization, separator,
    // secondOrganization, identifier) are read back by findStandards.
    private static final String REGEX_FALLBACK = "\\(?" + "(?<mainOrganization>[A-Z]\\w+)"
            + "\\)?((\\s?(?<separator>\\/)\\s?)(\\w+\\s)*\\(?" + "(?<secondOrganization>[A-Z]\\w+)" + "\\)?)?"
            + REGEX_STANDARD_TYPE + "?" + "(-|\\s)?" + REGEX_IDENTIFIER;

    // Regular expression to match the standard organization within a string
    // that is supposed to be a standard reference
    private static final String REGEX_STANDARD = ".*" + REGEX_ORGANIZATION + ".+" + REGEX_ORGANIZATION + "?.*";

    // Patterns are compiled once here instead of on every call / every match.
    private static final Pattern PATTERN_HEADER = Pattern.compile(REGEX_HEADER);
    private static final Pattern PATTERN_APPLICABLE_DOCUMENTS = Pattern.compile(REGEX_APPLICABLE_DOCUMENTS);
    private static final Pattern PATTERN_FALLBACK = Pattern.compile(REGEX_FALLBACK);
    private static final Pattern PATTERN_STANDARD = Pattern.compile(REGEX_STANDARD);
    private static final Pattern PATTERN_STANDARD_TYPE = Pattern.compile(".*" + REGEX_STANDARD_TYPE + ".*");

    /**
     * Extracts the standard references found within the given text.
     *
     * @param text
     *            the text from which the standard references are extracted;
     *            must not be {@code null}.
     * @param threshold
     *            the lower bound limit to be used in order to select only the
     *            standard references with score greater than or equal to the
     *            threshold. For instance, using a threshold of 0.75 means that
     *            only the patterns with score greater than or equal to 0.75
     *            will be returned.
     * @return the list of standard references extracted from the given text.
     */
    public static ArrayList<StandardReference> extractStandardReferences(String text, double threshold) {
        TreeMap<Integer, String> headers = findHeaders(text);
        return findStandards(text, headers, threshold);
    }

    /**
     * This method helps to find the headers within the given text.
     *
     * @param text
     *            the text from which the headers are extracted.
     * @return the headers found within the given text, keyed by the offset at
     *         which each header starts and sorted by that offset.
     */
    private static TreeMap<Integer, String> findHeaders(String text) {
        TreeMap<Integer, String> headers = new TreeMap<>();
        Matcher matcher = PATTERN_HEADER.matcher(text);
        while (matcher.find()) {
            headers.put(matcher.start(), matcher.group());
        }
        return headers;
    }

    /**
     * This method helps to find the standard references within the given text.
     *
     * @param text
     *            the text from which the standards references are extracted.
     * @param headers
     *            the headers found within the given text, keyed by start
     *            offset.
     * @param threshold
     *            the lower bound limit to be used in order to select only the
     *            standard references with score greater than or equal to the
     *            threshold.
     * @return the list of standard references extracted from the given text.
     */
    private static ArrayList<StandardReference> findStandards(String text, TreeMap<Integer, String> headers,
            double threshold) {
        ArrayList<StandardReference> standards = new ArrayList<>();
        Matcher matcher = PATTERN_FALLBACK.matcher(text);

        while (matcher.find()) {
            String candidate = matcher.group();

            // separator and secondOrganization are null when the optional
            // second-organization part of the pattern did not take part in
            // the match; they are handed to the builder as they are.
            StandardReferenceBuilder builder = new StandardReference.StandardReferenceBuilder(
                    matcher.group("mainOrganization"), matcher.group("identifier"))
                    .setSecondOrganization(matcher.group("separator"), matcher.group("secondOrganization"));

            // each potential standard reference starts with score equal to 0.25
            double score = 0.25;

            // increases by 0.25 the score of references which include the name of a known standard organization
            if (PATTERN_STANDARD.matcher(candidate).matches()) {
                score += 0.25;
            }

            // increases by 0.25 the score of references which include the word "Publication" or "Standard"
            if (PATTERN_STANDARD_TYPE.matcher(candidate).matches()) {
                score += 0.25;
            }

            // The section a reference belongs to is the one opened by the
            // closest header starting at or before the reference. floorEntry
            // also covers references located after the last header, and yields
            // null when no header precedes the reference.
            Entry<Integer, String> enclosingHeader = headers.floorEntry(matcher.start());

            // increases by 0.25 the score of references which have been found within "Applicable Documents" and equivalent sections
            if (enclosingHeader != null
                    && PATTERN_APPLICABLE_DOCUMENTS.matcher(enclosingHeader.getValue()).find()) {
                score += 0.25;
            }

            builder.setScore(score);

            if (score >= threshold) {
                standards.add(builder.build());
            }
        }

        return standards;
    }
}