org.spdx.compare.LicenseCompareHelper Maven / Gradle / Ivy

Go to download
/**
 * Copyright (c) 2013 Source Auditor Inc.
 * Copyright (c) 2013 Black Duck Software Inc.
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 *
*/
package org.spdx.compare;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.spdx.compare.CompareTemplateOutputHandler.DifferenceDescription;
import org.spdx.licenseTemplate.LicenseTemplateRuleException;
import org.spdx.licenseTemplate.SpdxLicenseTemplateHelper;
import org.spdx.rdfparser.InvalidSPDXAnalysisException;
import org.spdx.rdfparser.license.AnyLicenseInfo;
import org.spdx.rdfparser.license.ConjunctiveLicenseSet;
import org.spdx.rdfparser.license.DisjunctiveLicenseSet;
import org.spdx.rdfparser.license.ExtractedLicenseInfo;
import org.spdx.rdfparser.license.License;
import org.spdx.rdfparser.license.LicenseException;
import org.spdx.rdfparser.license.LicenseInfoFactory;
import org.spdx.rdfparser.license.LicenseParserException;
import org.spdx.rdfparser.license.LicenseSet;
import org.spdx.rdfparser.license.SpdxListedLicense;

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

/**
 * Primarily a static class of helper functions for comparing two SPDX licenses
 * @author Gary O'Neall
 *
 */
public class LicenseCompareHelper {
	
	protected static final String TOKEN_SPLIT_REGEX = "(^|[^\\s\\.,?'();:\"/]+)((\\s|\\.|,|\\?|'|\"|\\(|\\)|;|:|/|$)+)";
	protected static final Pattern TOKEN_SPLIT_PATTERN = Pattern.compile(TOKEN_SPLIT_REGEX);
	
	protected static final ImmutableSet PUNCTUATION = ImmutableSet.builder()
			.add(".").add(",").add("?").add("\"").add("'").add("(").add(")").add(";").add(":").add("/").build();
	
	// most of these are comments for common programming languages (C style, Java, Ruby, Python)
	protected static final ImmutableSet SKIPPABLE_TOKENS = ImmutableSet.builder()
		.add("//").add("/*").add("*/").add("/**").add("#").add("##")
		.add("*").add("**").add("\"\"\"").add("/").add("=begin").add("=end").build();
	
	protected static final Map NORMALIZE_TOKENS = Maps.newHashMap();
	
	static {
		//TODO: These should be moved to a property file
		NORMALIZE_TOKENS.put("acknowledgment","acknowledgement");   
		NORMALIZE_TOKENS.put("analogue","analog");   
		NORMALIZE_TOKENS.put("analyse","analyze");   
		NORMALIZE_TOKENS.put("artefact","artifact");   
		NORMALIZE_TOKENS.put("authorisation","authorization");   
		NORMALIZE_TOKENS.put("authorised","authorized");   
		NORMALIZE_TOKENS.put("calibre","caliber");   
		NORMALIZE_TOKENS.put("cancelled","canceled");   
		NORMALIZE_TOKENS.put("apitalisations","apitalizations");   
		NORMALIZE_TOKENS.put("catalogue","catalog");   
		NORMALIZE_TOKENS.put("categorise","categorize");   
		NORMALIZE_TOKENS.put("centre","center");   
		NORMALIZE_TOKENS.put("emphasised","emphasized");   
		NORMALIZE_TOKENS.put("favour","favor");   
		NORMALIZE_TOKENS.put("favourite","favorite");   
		NORMALIZE_TOKENS.put("fulfil","fulfill");   
		NORMALIZE_TOKENS.put("fulfilment","fulfillment");   
		NORMALIZE_TOKENS.put("initialise","initialize");   
		NORMALIZE_TOKENS.put("judgment","judgement");   
		NORMALIZE_TOKENS.put("labelling","labeling");   
		NORMALIZE_TOKENS.put("labour","labor");   
		NORMALIZE_TOKENS.put("licence","license");   
		NORMALIZE_TOKENS.put("maximise","maximize");   
		NORMALIZE_TOKENS.put("modelled","modeled");   
		NORMALIZE_TOKENS.put("modelling","modeling");   
		NORMALIZE_TOKENS.put("offence","offense");   
		NORMALIZE_TOKENS.put("optimise","optimize");   
		NORMALIZE_TOKENS.put("organisation","organization");   
		NORMALIZE_TOKENS.put("organise","organize");   
		NORMALIZE_TOKENS.put("practise","practice");   
		NORMALIZE_TOKENS.put("programme","program");   
		NORMALIZE_TOKENS.put("realise","realize");   
		NORMALIZE_TOKENS.put("recognise","recognize");   
		NORMALIZE_TOKENS.put("signalling","signaling");   
		NORMALIZE_TOKENS.put("utilisation","utilization");   
		NORMALIZE_TOKENS.put("whilst","while");   
		NORMALIZE_TOKENS.put("wilful","wilfull");   
		NORMALIZE_TOKENS.put("non-commercial","noncommercial");    
		NORMALIZE_TOKENS.put("copyright-owner", "copyright-holder");
		NORMALIZE_TOKENS.put("sublicense", "sub-license");
		NORMALIZE_TOKENS.put("non-infringement", "noninfringement");
		NORMALIZE_TOKENS.put("©", "(c)");
		NORMALIZE_TOKENS.put("copyright", "(c)");
		NORMALIZE_TOKENS.put("\"", "'");
	}
	
	
	static final String DASHES_REGEX = "[\\u2012\\u2013\\u2014\\u2015]";
	static final String PER_CENT_REGEX = "(?i)per\\scent";
	static final Pattern PER_CENT_PATTERN = Pattern.compile(PER_CENT_REGEX, Pattern.CASE_INSENSITIVE);
	static final String COPYRIGHT_HOLDER_REGEX = "(?i)copyright\\sholder";
	static final Pattern COPYRIGHT_HOLDER_PATTERN = Pattern.compile(COPYRIGHT_HOLDER_REGEX, Pattern.CASE_INSENSITIVE);
	static final String COPYRIGHT_OWNER_REGEX = "(?i)copyright\\sowner";
	static final Pattern COPYRIGHT_OWNER_PATTERN = Pattern.compile(COPYRIGHT_OWNER_REGEX, Pattern.CASE_INSENSITIVE);
	
	//TODO: Add equiv for quotes
	/**
	 * Returns true if two sets of license text is considered a match per
	 * the SPDX License matching guidelines documented at spdx.org (currently http://spdx.org/wiki/spdx-license-list-match-guidelines)
	 * There are 2 unimplemented features - bullets/numbering is not considered and comments with no whitespace between text is not skipped
	 * @param licenseTextA
	 * @param licenseTextB
	 * @return
	 */
	public static boolean isLicenseTextEquivalent(String licenseTextA, String licenseTextB) {
		//TODO: Handle comment characters without white space before text
		//TODO: Handle bullets and numbering
		// Need to take care of multi-word equivalent words - convert to single words with hypens
		
		// tokenize each of the strings
		if (licenseTextA == null) {
			return (licenseTextB == null || licenseTextB.isEmpty());
		}
		if (licenseTextB == null) {
			return licenseTextA.isEmpty();
		}
		if (licenseTextA.equals(licenseTextB)) {
			return true;
		}
		Map tokenToLocationA = new HashMap();
		Map tokenToLocationB = new HashMap();
		String[] licenseATokens = tokenizeLicenseText(licenseTextA,tokenToLocationA);
		String[] licenseBTokens = tokenizeLicenseText(licenseTextB,tokenToLocationB);
		int bTokenCounter = 0;
		int aTokenCounter = 0;
		String nextAToken = getTokenAt(licenseATokens, aTokenCounter++);
		String nextBToken = getTokenAt(licenseBTokens, bTokenCounter++);
		while (nextAToken != null) {
			if (nextBToken == null) {
				// end of b stream
				while (nextAToken != null && canSkip(nextAToken)) {
					nextAToken = getTokenAt(licenseATokens, aTokenCounter++);
				}
				if (nextAToken != null) {
					return false;	// there is more stuff in the license text B, so not equal
				}
			} else if (tokensEquivalent(nextAToken, nextBToken)) { 
				// just move onto the next set of tokens
				nextAToken = getTokenAt(licenseATokens, aTokenCounter++);
				nextBToken = getTokenAt(licenseBTokens, bTokenCounter++);
			} else {
				// see if we can skip through some B tokens to find a match
				while (nextBToken != null && canSkip(nextBToken)) {
					nextBToken = getTokenAt(licenseBTokens, bTokenCounter++);
				}
				// just to be sure, skip forward on the A license
				while (nextAToken != null && canSkip(nextAToken)) {
					nextAToken = getTokenAt(licenseATokens, aTokenCounter++);
				}
				if (!tokensEquivalent(nextAToken, nextBToken)) {
					return false;
				} else {
					nextAToken = getTokenAt(licenseATokens, aTokenCounter++);
					nextBToken = getTokenAt(licenseBTokens, bTokenCounter++);
				}
			}
		}
		// need to make sure B is at the end
		while (nextBToken != null && canSkip(nextBToken)) {
			nextBToken = getTokenAt(licenseBTokens, bTokenCounter++);
		}
		return (nextBToken == null);
	}
	
	/**
	 * Normalize quotes and no-break spaces
	 * @param s
	 * @return
	 */
	static String normalizeText(String s) {
		// First normalize single quotes, then normalize two single quotes to a double quote, normalize double quotes 
		// then normalize non-breaking spaces to spaces
		return s.replaceAll("‘|’|‛|‚|`", "'")	// Take care of single quotes first
				.replaceAll("''","\"")			// This way, we can change doulbe single quotes to a single double cquote
				.replaceAll("“|”|‟|„", "\"")	// Now we can normalize the double quotes
				.replaceAll("\\u00A0", " ")		// replace non-breaking spaces with spaces since Java does not handle the former well
				.replaceAll("\\u2028", "\n");	// replace line separator with newline since Java does not handle the former well
	}
	
	/**
	 * Locate the original text starting with the start token and ending with the end token
	 * @param fullLicenseText
	 * @param startToken
	 * @param endToken
	 * @param tokenToLocation
	 * @return
	 */
	public static String locateOriginalText(String fullLicenseText, int startToken, int endToken,  
			Map tokenToLocation, String[] tokens) {
		if (startToken > endToken) {
			return "";
		}
		LineColumn start = tokenToLocation.get(startToken);
		if (start == null) {
			return "";
		}
		LineColumn end = tokenToLocation.get(endToken);
		// If end == null, then we read to the end
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new StringReader(fullLicenseText));
			int currentLine = 1;
			String line = reader.readLine();
			while (line != null && currentLine < start.getLine()) {
				currentLine++;
				line = reader.readLine();
			}
			if (line == null) {
				return "";
			}
			if (end == null) {
				// read until the end of the stream
				StringBuilder sb = new StringBuilder(line.substring(start.getColumn(), line.length()));
				currentLine++;
				line = reader.readLine();
				while (line != null) {
					sb.append(line);
					currentLine++;
					line = reader.readLine();
				}
				return sb.toString();
			} else if (end.getLine() == currentLine) {
				return line.substring(start.getColumn(), end.getColumn()+end.getLen());
			} else {
				StringBuilder sb = new StringBuilder(line.substring(start.getColumn(), line.length()));
				currentLine++;
				line = reader.readLine();
				while (line != null && currentLine < end.getLine()) {
					sb.append("\n");
					sb.append(line);
					currentLine++;
					line = reader.readLine();
				}
				if (line != null && end.getColumn()+end.getLen() > 0) {
					sb.append("\n");
					sb.append(line.substring(0, end.getColumn()+end.getLen()));
				}
				return sb.toString();
			}			
		} catch (IOException e) {
			// just build with spaces - not ideal, but close enough most of the time
			StringBuilder sb = new StringBuilder(tokens[startToken]);
			for (int i = startToken+1; i <= endToken; i++) {
				sb.append(' ');
				sb.append(tokens[i]);
			}
			return sb.toString();
		} finally {
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException e) {
					// ignore
				}
			}
		}
	}
	
	/**
	 * Tokenizes the license text, normalizes quotes, lowercases and converts multi-words for better equiv. comparisons
	 * @param tokenLocations location for all of the tokens
	 * @param licenseText
	 * @return
	 * @throws IOException 
	 */
	public static String[] tokenizeLicenseText(String licenseText, Map tokenToLocation) {
		String textToTokenize = normalizeText(replaceMultWord(licenseText)).toLowerCase();
		List tokens = new ArrayList();
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new StringReader(textToTokenize));
			int currentLine = 1;
			int currentToken = 0;
			String line = reader.readLine();
			while (line != null) {
				Matcher lineMatcher = TOKEN_SPLIT_PATTERN.matcher(line);
				while (lineMatcher.find()) {
					String token = lineMatcher.group(1).trim();
					if (!token.isEmpty()) {
						tokens.add(token);
						tokenToLocation.put(currentToken, new LineColumn(currentLine, lineMatcher.start(), token.length()));
						currentToken++;
					}
					String fullMatch = lineMatcher.group(0);
					for (int i = lineMatcher.group(1).length(); i < fullMatch.length(); i++) {
						String possiblePunctuation = fullMatch.substring(i, i+1);
						if (PUNCTUATION.contains(possiblePunctuation)) {
							tokens.add(possiblePunctuation);
							tokenToLocation.put(currentToken, new LineColumn(currentLine, lineMatcher.start()+i, 1));
							currentToken++;
						}
					}
				}
				currentLine++;
				line = reader.readLine();
			}
		} catch (IOException e) {
			// Don't fill in the lines, take a simpler approach
			Matcher m = TOKEN_SPLIT_PATTERN.matcher(textToTokenize);
			while (m.find()) {
				String word = m.group(1).trim();
				String seperator = m.group(2).trim();
				tokens.add(word);
				if (PUNCTUATION.contains(seperator)) {
					tokens.add(seperator);
				}
			}
		} finally {
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException e) {
					// ignore
				}
			}
		}
		return tokens.toArray(new String[tokens.size()]);
	}
	
	/**
	 * @param text
	 * @return the first token in the license text
	 */
	public static String getFirstLicenseToken(String text) {
		String textToTokenize = normalizeText(replaceMultWord(text)).toLowerCase();
		Matcher m = TOKEN_SPLIT_PATTERN.matcher(textToTokenize);
		while (m.find()) {
			if (!m.group(1).trim().isEmpty()) {
				return m.group(1).trim();
			}
		}
		return null;
	}
	
	/**
	 * @param text
	 * @return true if the text contains a single token
	 */
	public static boolean isSingleTokenString(String text) {
		if (text.contains("\n")) {
			return false;
		}
		Matcher m = TOKEN_SPLIT_PATTERN.matcher(text);
		boolean found = false;
		while (m.find()) {
			if (!m.group(1).trim().isEmpty()) {
				if (found) {
					return false;
				} else {
					found = true;
				}
			}
		}
		return true;
	}

	/**
	 * replaces all mult-words with a single token using a dash to separate
	 * @param s
	 * @return
	 */
	private static String replaceMultWord(String s) {
		Matcher m = COPYRIGHT_HOLDER_PATTERN.matcher(s);
		String retval = m.replaceAll("copyright-holder");
		m = COPYRIGHT_OWNER_PATTERN.matcher(retval);
		retval = m.replaceAll("copyright-owner");
		m = PER_CENT_PATTERN.matcher(retval);
		retval = m.replaceAll("percent");
		return retval;
	}
	
	/**
	 * Just fetches the string at the index checking for range.  Returns null if index is out of range.
	 * @param tokens
	 * @param tokenIndex
	 * @return
	 */
	static String getTokenAt(String[] tokens, int tokenIndex) {
		if (tokenIndex >= tokens.length) {
			return null;
		} else {
			return tokens[tokenIndex];
		}
	}
	/**
	 * Returns true if the two tokens can be considered equlivalent per the SPDX license matching rules
	 * @param tokenA
	 * @param tokenB
	 * @return
	 */
	static boolean tokensEquivalent(String tokenA, String tokenB) {
		if (tokenA == null) {
			if (tokenB == null) {
				return true;
			} else {
				return false;
			}
		} else if (tokenB == null) {
			return false;
		} else {
			String s1 = tokenA.trim().toLowerCase().replaceAll(DASHES_REGEX, "-");
			String s2 = tokenB.trim().toLowerCase().replaceAll(DASHES_REGEX, "-");
			if (s1.equals(s2)) {
				return true;
			} else {
				// check for equivalent tokens by normalizing the tokens
				String ns1 = NORMALIZE_TOKENS.get(s1);
				if (ns1 == null) {
					ns1 = s1;
				}
				String ns2 = NORMALIZE_TOKENS.get(s2);
				if (ns2 == null) {
					ns2 = s2;
				}
				return ns1.equals(ns2);
			}
		}
	}
	/**
	 * Returns true if the token can be ignored per the rules
	 * @param token
	 * @return
	 */
	static boolean canSkip(String token) {
		if (token == null) {
			return false;
		}
		if (token.trim().isEmpty()) {
			return true;
		}
		return SKIPPABLE_TOKENS.contains(token.trim().toLowerCase());
	}

	/**
	 * Compares two licenses from potentially two different documents which may have
	 * different license ID's for the same license
	 * @param license1
	 * @param license2
	 * @param xlationMap Mapping the license ID's from license 1 to license 2
	 * @return
	 * @throws SpdxCompareException 
	 */
	public static boolean isLicenseEqual(AnyLicenseInfo license1,
			AnyLicenseInfo license2, Map xlationMap) throws SpdxCompareException {
		if (license1 instanceof ConjunctiveLicenseSet) {
			if (!(license2 instanceof ConjunctiveLicenseSet)) {
				return false;
			} else {
				return isLicenseSetsEqual((ConjunctiveLicenseSet)license1,
						(ConjunctiveLicenseSet)license2, xlationMap);
			}
		} else if (license1 instanceof DisjunctiveLicenseSet) {
			if (!(license2 instanceof DisjunctiveLicenseSet)) {
				return false;
			} else {
				return isLicenseSetsEqual((DisjunctiveLicenseSet)license1,
						(DisjunctiveLicenseSet)license2, xlationMap);
			}
		} else if (license1 instanceof ExtractedLicenseInfo) {
			if (!(license2 instanceof ExtractedLicenseInfo)) {
				return false;
			} else {
				String licenseid1 = ((ExtractedLicenseInfo)license1).getLicenseId();
				String licenseid2 = ((ExtractedLicenseInfo)license2).getLicenseId();
				String xlatedLicenseId = xlationMap.get(licenseid1);
				if (xlatedLicenseId == null) {
					return false;	// no equivalent license was found
				}
				return xlatedLicenseId.equals(licenseid2);
			}
		} else {
            return license1.equals(license2);
        }
	}

	/**
	 * Compares two license sets using the xlationMap for the non-standard license IDs
	 * @param license1
	 * @param license2
	 * @return
	 * @throws SpdxCompareException 
	 */
	private static boolean isLicenseSetsEqual(LicenseSet license1, LicenseSet license2, Map xlationMap) throws SpdxCompareException {
		// note - order does not matter
		AnyLicenseInfo[] licenseInfos1 = license1.getMembers();
		AnyLicenseInfo[] licenseInfos2 = license2.getMembers();
		if (licenseInfos1 == null) {
			return licenseInfos2 == null;
		}
		if (licenseInfos2 == null) {
			return false;
		}
		if (licenseInfos1.length != licenseInfos2.length) {
			return false;
		}
		for (int i = 0; i < licenseInfos1.length; i++) {
			boolean found = false;
			for (int j = 0; j < licenseInfos2.length; j++) {
				if (isLicenseEqual(licenseInfos1[i], licenseInfos2[j], xlationMap)) {
					found = true;
					break;
				}
			}
			if (!found) {
				return false;
			}
		}
		return true;
	}
	
	/**
	 * Compares license text to the license text of an SPDX Standard License
	 * @param license SPDX Standard License to compare
	 * @param compareText Text to compare to the standard license
	 * @return any differences found
	 * @throws SpdxCompareException
	 */
	public static DifferenceDescription isTextStandardLicense(License license, String compareText) throws SpdxCompareException {
		String licenseTemplate = license.getStandardLicenseTemplate();
		if (licenseTemplate == null || licenseTemplate.trim().isEmpty()) {
			licenseTemplate = license.getLicenseText();
		}
		CompareTemplateOutputHandler compareTemplateOutputHandler = null;
		try {
			compareTemplateOutputHandler = new CompareTemplateOutputHandler(compareText);
		} catch (IOException e1) {
			throw(new SpdxCompareException("IO Error reading the compare text: "+e1.getMessage(),e1));
		}
		try {
			SpdxLicenseTemplateHelper.parseTemplate(licenseTemplate, compareTemplateOutputHandler);
		} catch (LicenseTemplateRuleException e) {
			throw(new SpdxCompareException("Invalid template rule found during compare: "+e.getMessage(),e));
		} catch (LicenseParserException e) {
			throw(new SpdxCompareException("Invalid template found during compare: "+e.getMessage(),e));
		}
		return compareTemplateOutputHandler.getDifferences();
	}
	
	/**
	 * Compares exception text to the exception text of an SPDX Standard exception
	 * @param exception SPDX Standard exception to compare
	 * @param compareText Text to compare to the standard exceptions
	 * @return any differences found
	 * @throws SpdxCompareException
	 */
	public static DifferenceDescription isTextStandardException(LicenseException exception, String compareText) throws SpdxCompareException {
		String exceptionTemplate = exception.getLicenseExceptionTemplate();
		if (exceptionTemplate == null || exceptionTemplate.trim().isEmpty()) {
			exceptionTemplate = exception.getLicenseExceptionText();
		}
		CompareTemplateOutputHandler compareTemplateOutputHandler = null;
		try {
			compareTemplateOutputHandler = new CompareTemplateOutputHandler(compareText);
		} catch (IOException e1) {
			throw(new SpdxCompareException("IO Error reading the compare text: "+e1.getMessage(),e1));
		}
		try {
			SpdxLicenseTemplateHelper.parseTemplate(exceptionTemplate, compareTemplateOutputHandler);
		} catch (LicenseTemplateRuleException e) {
			throw(new SpdxCompareException("Invalid template rule found during compare: "+e.getMessage(),e));
		} catch (LicenseParserException e) {
			throw(new SpdxCompareException("Invalid template found during compare: "+e.getMessage(),e));
		}
		return compareTemplateOutputHandler.getDifferences();
	}
	
	/**
	 * Returns a list of SPDX Standard License ID's that match the text provided using
	 * the SPDX matching guidelines.
	 * @param licenseText Text to compare to the standard license texts
	 * @return Array of SPDX standard license IDs that match
	 * @throws InvalidSPDXAnalysisException If an error occurs accessing the standard licenses
	 * @throws SpdxCompareException If an error occurs in the comparison
	 */
	public static String[] matchingStandardLicenseIds(String licenseText) throws InvalidSPDXAnalysisException, SpdxCompareException {
		String[] stdLicenseIds = LicenseInfoFactory.getSpdxListedLicenseIds();
		List matchingIds  = Lists.newArrayList();
		for (String stdLicId : stdLicenseIds) {
			SpdxListedLicense license = LicenseInfoFactory.getListedLicenseById(stdLicId);
			if (!isTextStandardLicense(license, licenseText).isDifferenceFound()) {
				matchingIds.add(license.getLicenseId());
			}
		}
		return matchingIds.toArray(new String[matchingIds.size()]);
	}
}