All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.sax.StandardsText Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.sax;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.tika.sax.StandardReference.StandardReferenceBuilder;

/**
 * StandardText relies on regular expressions to extract standard references
 * from text.
 * 
 * 

* This class helps to find the standard references from text by performing the * following steps: *

    *
  1. searches for headers;
  2. *
  3. searches for patterns that are supposed to be standard references * (basically, every string mostly composed of uppercase letters followed by an * alphanumeric characters);
  4. *
  5. each potential standard reference starts with score equal to 0.25;
  6. *
  7. increases by 0.25 the score of references which include the name of a * known standard organization ({@link StandardOrganizations});
  8. *
  9. increases by 0.25 the score of references which include the word * Publication or Standard;
  10. *
  11. increases by 0.25 the score of references which have been found within * "Applicable Documents" and equivalent sections;
  12. *
  13. returns the standard references along with scores.
  14. *
*

* */ public class StandardsText { // Regular expression to match uppercase headers private static final String REGEX_HEADER = "(\\d+\\.(\\d+\\.?)*)\\p{Blank}+([A-Z]+(\\s[A-Z]+)*){5,}"; // Regular expression to match the "APPLICABLE DOCUMENTS" and equivalent // sections private static final String REGEX_APPLICABLE_DOCUMENTS = "(?i:.*APPLICABLE\\sDOCUMENTS|REFERENCE|STANDARD|REQUIREMENT|GUIDELINE|COMPLIANCE.*)"; // Regular expression to match the alphanumeric identifier of the standard private static final String REGEX_IDENTIFIER = "(?([0-9]{3,}|([A-Z]+(-|_|\\.)?[0-9]{2,}))((-|_|\\.)?[A-Z0-9]+)*)"; // Regular expression to match the standard organization private static final String REGEX_ORGANIZATION = StandardOrganizations.getOrganzationsRegex(); // Regular expression to match the type of publication, often reported // between the name of the standard organization and the standard identifier private static final String REGEX_STANDARD_TYPE = "(\\s(?i:Publication|Standard))"; // Regular expression to match a string that is supposed to be a standard // reference private static final String REGEX_FALLBACK = "\\(?" + "(?[A-Z]\\w+)" + "\\)?((\\s?(?\\/)\\s?)(\\w+\\s)*\\(?" + "(?[A-Z]\\w+)" + "\\)?)?" + REGEX_STANDARD_TYPE + "?" + "(-|\\s)?" + REGEX_IDENTIFIER; // Regular expression to match the standard organization within a string // that is supposed to be a standard reference private static final String REGEX_STANDARD = ".*" + REGEX_ORGANIZATION + ".+" + REGEX_ORGANIZATION + "?.*"; /** * Extracts the standard references found within the given text. * * @param text * the text from which the standard references are extracted. * @param threshold * the lower bound limit to be used in order to select only the * standard references with score greater than or equal to the * threshold. For instance, using a threshold of 0.75 means that * only the patterns with score greater than or equal to 0.75 * will be returned. * @return the list of standard references extracted from the given text. */ public static ArrayList extractStandardReferences(String text, double threshold) { Map headers = findHeaders(text); ArrayList standardReferences = findStandards(text, headers, threshold); return standardReferences; } /** * This method helps to find the headers within the given text. * * @param text * the text from which the headers are extracted. * @return the list of headers found within the given text. */ private static Map findHeaders(String text) { Map headers = new TreeMap(); Pattern pattern = Pattern.compile(REGEX_HEADER); Matcher matcher = pattern.matcher(text); while (matcher.find()) { headers.put(matcher.start(), matcher.group()); } return headers; } /** * This method helps to find the standard references within the given text. * * @param text * the text from which the standards references are extracted. * @param headers * the list of headers found within the given text. * @param threshold * the lower bound limit to be used in order to select only the * standard references with score greater than or equal to the * threshold. * @return the list of standard references extracted from the given text. */ private static ArrayList findStandards(String text, Map headers, double threshold) { ArrayList standards = new ArrayList(); double score = 0; Pattern pattern = Pattern.compile(REGEX_FALLBACK); Matcher matcher = pattern.matcher(text); while (matcher.find()) { StandardReferenceBuilder builder = new StandardReference.StandardReferenceBuilder( matcher.group("mainOrganization"), matcher.group("identifier")) .setSecondOrganization(matcher.group("separator"), matcher.group("secondOrganization")); score = 0.25; // increases by 0.25 the score of references which include the name of a known standard organization if (matcher.group().matches(REGEX_STANDARD)) { score += 0.25; } // increases by 0.25 the score of references which include the word "Publication" or "Standard" if (matcher.group().matches(".*" + REGEX_STANDARD_TYPE + ".*")) { score += 0.25; } int startHeader = 0; int endHeader = 0; boolean headerFound = false; Iterator> iterator = headers.entrySet().iterator(); while (iterator.hasNext() && !headerFound) { startHeader = endHeader; endHeader = iterator.next().getKey(); if (endHeader > matcher.start()) { headerFound = true; } } String header = headers.get(startHeader); // increases by 0.25 the score of references which have been found within "Applicable Documents" and equivalent sections if (header != null && headers.get(startHeader).matches(REGEX_APPLICABLE_DOCUMENTS)) { score += 0.25; } builder.setScore(score); if (score >= threshold) { standards.add(builder.build()); } } return standards; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy