All Downloads are FREE. Search and download functionalities are using the official Maven repository.

package.rewrite-pattern.js Maven / Gradle / Ivy

Go to download

regexpu’s core functionality (i.e. `rewritePattern(pattern, flag)`), capable of translating ES6 Unicode regular expressions to ES5.

The newest version!
'use strict';

const generate = require('regjsgen').generate;
const parse = require('regjsparser').parse;
const regenerate = require('regenerate');
const unicodeMatchProperty = require('unicode-match-property-ecmascript');
const unicodeMatchPropertyValue = require('unicode-match-property-value-ecmascript');
const iuMappings = require('./data/iu-mappings.js');
const iBMPMappings = require('./data/i-bmp-mappings.js');
const iuFoldings = require('./data/iu-foldings.js');
const ESCAPE_SETS = require('./data/character-class-escape-sets.js');
const { UNICODE_SET, UNICODE_IV_SET } = require('./data/all-characters.js');

/**
 * Maps every element of `array` through `callback` and flattens the results
 * by one level: array results contribute their elements, any other result is
 * appended as-is.
 *
 * @param {Array} array The items to map.
 * @param {Function} callback Invoked with each item; may return an array.
 * @returns {Array} A new, one-level-flattened array of results.
 */
function flatMap(array, callback) {
	const flattened = [];
	array.forEach(entry => {
		const mapped = callback(entry);
		const additions = Array.isArray(mapped) ? mapped : [mapped];
		for (const addition of additions) {
			flattened.push(addition);
		}
	});
	return flattened;
}

/**
 * Reports whether the last entry of `regenerateData.data` is at or above
 * U+10000, i.e. outside the Basic Multilingual Plane.
 *
 * @param {{data: number[]}} regenerateData Object exposing a `data` array.
 * @returns {boolean} `false` for empty data, otherwise the astral check.
 */
function regenerateContainsAstral(regenerateData) {
	const { data } = regenerateData;
	if (data.length === 0) {
		return false;
	}
	const lastEntry = data[data.length - 1];
	return lastEntry >= 0x10000;
}

// Matches every regular-expression syntax character so that it can be
// backslash-escaped before being embedded in a generated pattern.
// https://tc39.es/ecma262/#prod-SyntaxCharacter
const SYNTAX_CHARS = /[\\^$.*+?()[\]{}|]/g;

// All code points above the Basic Multilingual Plane (U+10000..U+10FFFF).
const ASTRAL_SET = regenerate().addRange(0x10000, 0x10FFFF);

// The four code points treated as line terminators.
const NEWLINE_SET = regenerate().add(
	// `LineTerminator`s (https://mths.be/es6#sec-line-terminators):
	0x000A, // Line Feed
	0x000D, // Carriage Return
	0x2028, // Line Separator
	0x2029  // Paragraph Separator
);

// Prepare a Regenerate set containing all code points that are supposed to be
// matched by `/./u`. https://mths.be/es6#sec-atom
// `UNICODE_SET` is cloned first so the imported set itself is left untouched.
const DOT_SET_UNICODE = UNICODE_SET.clone() // all Unicode code points
	.remove(NEWLINE_SET);

/**
 * Looks up the precomputed set for a character class escape (the `character`
 * key is looked up in one of the `ESCAPE_SETS` tables).
 *
 * Table selection:
 *   - not `unicode`                         -> `REGULAR`
 *   - `unicode`, not `ignoreCase`           -> `UNICODE`
 *   - `unicode` + `ignoreCase`              -> `UNICODE_IGNORE_CASE`
 *   - `unicode` + `ignoreCase` + SCF        -> `UNICODESET_IGNORE_CASE`
 *
 * @param {string} character Key of the escape in the tables.
 * @param {boolean} unicode Whether Unicode-aware tables should be used.
 * @param {boolean} ignoreCase Whether the case-insensitive tables apply.
 * @param {boolean} [shouldApplySCF] Whether simple case folding is requested.
 * @returns {*} Whatever the selected table stores for `character`.
 */
const getCharacterClassEscapeSet = (character, unicode, ignoreCase, shouldApplySCF) => {
	if (!unicode) {
		return ESCAPE_SETS.REGULAR.get(character);
	}
	if (!ignoreCase) {
		return ESCAPE_SETS.UNICODE.get(character);
	}
	const plainIgnoreCase = ESCAPE_SETS.UNICODE_IGNORE_CASE.get(character);
	return shouldApplySCF
		? ESCAPE_SETS.UNICODESET_IGNORE_CASE.get(character)
		: plainIgnoreCase;
};

/**
 * Picks the set that `.` should match in Unicode mode: every code point when
 * `dotAll` is truthy, otherwise the set that excludes line terminators.
 *
 * @param {boolean} dotAll Whether dot-all semantics are in effect.
 * @returns {*} `UNICODE_SET` or `DOT_SET_UNICODE` (shared instances).
 */
const getUnicodeDotSet = (dotAll) => {
	if (dotAll) {
		return UNICODE_SET;
	}
	return DOT_SET_UNICODE;
};

/**
 * Loads the data module for a Unicode property (and optional value) from
 * `regenerate-unicode-properties`.
 *
 * With a truthy `value` the module path is `<property>/<value>`; otherwise
 * the property is treated as a binary property and the path is
 * `Binary_Property/<property>`.
 *
 * @param {string} property Property name used to build the module path.
 * @param {string} [value] Property value used to build the module path.
 * @returns {*} The exports of the required data module.
 * @throws {Error} When the module cannot be loaded for any reason.
 */
const getUnicodePropertyValueSet = (property, value) => {
	let modulePath;
	if (value) {
		modulePath = `${ property }/${ value }`;
	} else {
		modulePath = `Binary_Property/${ property }`;
	}
	try {
		return require(`regenerate-unicode-properties/${ modulePath }.js`);
	} catch (exception) {
		// Any load failure is reported as an unrecognized property/value pair.
		const message =
			`Failed to recognize value \`${ value }\` for property ` +
			`\`${ property }\`.`;
		throw new Error(message);
	}
};

/**
 * Resolves a property escape written without `=` by trying, in order:
 *   1. a `General_Category` value,
 *   2. a property of strings,
 *   3. a binary property of single code points.
 *
 * Failures of the first two attempts are deliberately ignored so the next
 * interpretation can be tried; only the final attempt may throw.
 *
 * @param {string} value The lone name found in the escape.
 * @returns {*} The data returned by `getUnicodePropertyValueSet`.
 * @throws When none of the three interpretations succeeds.
 */
const handleLoneUnicodePropertyNameOrValue = (value) => {
	const attempts = [
		// Note: `unicodeMatchPropertyValue` throws on invalid values.
		() => {
			const category = unicodeMatchPropertyValue('General_Category', value);
			return getUnicodePropertyValueSet('General_Category', category);
		},
		// Not a `General_Category` value: maybe it is a property of strings.
		() => getUnicodePropertyValueSet('Property_of_Strings', value),
	];
	for (const attempt of attempts) {
		try {
			return attempt();
		} catch (exception) {
			// Intentionally ignored: fall through to the next interpretation.
		}
	}
	// Last resort: a binary property of single code points.
	// Note: `unicodeMatchProperty` throws on invalid properties.
	const binaryProperty = unicodeMatchProperty(value);
	return getUnicodePropertyValueSet(binaryProperty);
};

/**
 * Resolves the body of a Unicode property escape into its code points and
 * strings.
 *
 * `value` is either a lone name (`Name`) or a `Property=Value` pair. For a
 * negated escape the result is the complement of the property's code points
 * within `UNICODE_IV_SET` (when `isUnicodeSetIgnoreCase`) or `UNICODE_SET`.
 *
 * @param {string} value Escape body, e.g. `Lu` or `Script=Greek`.
 * @param {boolean} isNegative Whether the escape is negated.
 * @param {boolean} isUnicodeSetIgnoreCase Selects the universe used for negation.
 * @returns {{characters: *, strings: Set<string>}} A fresh code point set and
 *   a set of strings with regex syntax characters backslash-escaped.
 * @throws {Error} When negating a property that contains strings.
 */
const getUnicodePropertyEscapeSet = (value, isNegative, isUnicodeSetIgnoreCase) => {
	const segments = value.split('=');
	let propertySet;
	if (segments.length == 1) {
		propertySet = handleLoneUnicodePropertyNameOrValue(segments[0]);
	} else {
		// The pattern consists of two parts, i.e. `Property=Value`.
		const canonicalProperty = unicodeMatchProperty(segments[0]);
		const canonicalValue = unicodeMatchPropertyValue(canonicalProperty, segments[1]);
		propertySet = getUnicodePropertyValueSet(canonicalProperty, canonicalValue);
	}

	if (!isNegative) {
		const escapedStrings = new Set();
		if (propertySet.strings) {
			// Strings like *️⃣ contain syntax characters; escape them so they
			// can be used safely in a generated alternation.
			propertySet.strings
				.map(str => str.replace(SYNTAX_CHARS, '\\$&'))
				.forEach(str => escapedStrings.add(str));
		}
		return {
			characters: propertySet.characters.clone(),
			strings: escapedStrings
		};
	}

	if (propertySet.strings) {
		throw new Error('Cannot negate Unicode property of strings');
	}
	const universe = isUnicodeSetIgnoreCase ? UNICODE_IV_SET : UNICODE_SET;
	return {
		characters: universe.clone().remove(propertySet.characters),
		strings: new Set()
	};
};

/**
 * Builds character class data for a Unicode property escape.
 *
 * The property's code points are optionally case-folded (`shouldApplySCF`),
 * then extended with case equivalents when the current configuration asks
 * for them. Strings belonging to the property are carried over as
 * `longStrings`.
 *
 * @param {string} property Escape body, e.g. `Lu` or `Script=Greek`.
 * @param {boolean} isNegative Whether the escape is negated.
 * @param {boolean} isUnicodeSetIgnoreCase Forwarded to `getUnicodePropertyEscapeSet`.
 * @param {boolean} [shouldApplySCF] Whether to apply simple case folding.
 * @returns {object} Data shaped like `getCharacterClassEmptyData()`.
 */
const getUnicodePropertyEscapeCharacterClassData = (property, isNegative, isUnicodeSetIgnoreCase, shouldApplySCF) => {
	const escapeSet = getUnicodePropertyEscapeSet(property, isNegative, isUnicodeSetIgnoreCase);
	const classData = getCharacterClassEmptyData();

	let codePoints = escapeSet.characters;
	if (shouldApplySCF) {
		const folded = codePoints.toArray().map(ch => simpleCaseFolding(ch));
		codePoints = regenerate(folded);
	}

	const caseEqFlags = configGetCaseEqFlags();
	if (caseEqFlags) {
		// Iterate over a snapshot, since the set is extended while looping.
		codePoints.toArray().forEach((codePoint) => {
			const equivalents = getCaseEquivalents(codePoint, caseEqFlags);
			if (equivalents) {
				codePoints.add(equivalents);
			}
		});
	}
	classData.singleChars = codePoints;

	const hasStrings = escapeSet.strings.size > 0;
	if (hasStrings) {
		classData.longStrings = escapeSet.strings;
		classData.maybeIncludesStrings = true;
	}
	return classData;
};

// Bit flags selecting which case-equivalence tables `getCaseEquivalents`
// consults. They can be OR-ed together.
// No case equivalents are requested.
const CASE_EQ_FLAG_NONE = 0b00;
// Use the BMP mappings (`iBMPMappings`) plus the ASCII fast path.
const CASE_EQ_FLAG_BMP = 0b01;
// Use the Unicode mappings (`iuMappings`).
const CASE_EQ_FLAG_UNICODE = 0b10;

/**
 * Derives the `CASE_EQ_FLAG_*` bit mask from the current `config`.
 *
 *   - Inline modifier `i` enabled (`modifiersData.i === true`): only relevant
 *     when modifiers are being transformed; BMP equivalents are requested,
 *     plus Unicode equivalents when the `u` or `v` flag is set.
 *   - No inline `i` information (`modifiersData.i === undefined`): Unicode
 *     equivalents are requested when the unicode flag is being transformed
 *     and the `i` flag is set.
 *   - Otherwise: no equivalents.
 *
 * @returns {number} Bitwise combination of `CASE_EQ_FLAG_*`.
 */
function configGetCaseEqFlags() {
	const inlineIgnoreCase = config.modifiersData.i;

	if (inlineIgnoreCase === true) {
		if (!config.transform.modifiers) {
			return CASE_EQ_FLAG_NONE;
		}
		const isUnicodeMode = config.flags.unicode || config.flags.unicodeSets;
		return isUnicodeMode
			? CASE_EQ_FLAG_NONE | CASE_EQ_FLAG_BMP | CASE_EQ_FLAG_UNICODE
			: CASE_EQ_FLAG_NONE | CASE_EQ_FLAG_BMP;
	}

	if (inlineIgnoreCase === undefined && config.transform.unicodeFlag && config.flags.ignoreCase) {
		return CASE_EQ_FLAG_NONE | CASE_EQ_FLAG_UNICODE;
	}

	return CASE_EQ_FLAG_NONE;
}

// Given a range of code points, add the case equivalents of every code point
// in that range to this set. The body runs at least once, so `min` is always
// processed even when `max < min`.
regenerate.prototype.iuAddRange = function(min, max, caseEqFlags) {
	let codePoint = min;
	do {
		const equivalents = getCaseEquivalents(codePoint, caseEqFlags);
		if (equivalents) {
			this.add(equivalents);
		}
	} while (++codePoint <= max);
	return this;
};
// Counterpart of `iuAddRange`: remove the case equivalents of every code
// point in the range from this set.
regenerate.prototype.iuRemoveRange = function(min, max, caseEqFlags) {
	let codePoint = min;
	do {
		const equivalents = getCaseEquivalents(codePoint, caseEqFlags);
		if (equivalents) {
			this.remove(equivalents);
		}
	} while (++codePoint <= max);
	return this;
};

/**
 * Replaces the contents of AST node `item` with the parse result of
 * `pattern`. The node is mutated in place via `Object.assign`.
 *
 * Parse results of type `characterClass`, `group` or `value` are used
 * directly; anything else is first wrapped in a non-capturing group.
 *
 * @param {object} item AST node to overwrite.
 * @param {string} pattern Source of the replacement pattern.
 */
const update = (item, pattern) => {
	const parserFlags = config.useUnicodeFlag ? 'u' : '';
	const parsed = parse(pattern, parserFlags, {
		lookbehind: true,
		namedGroups: true,
		unicodePropertyEscape: true,
		unicodeSet: true,
		modifiers: true,
	});
	const isSelfContained =
		parsed.type === 'characterClass' ||
		parsed.type === 'group' ||
		parsed.type === 'value';
	// Wrap the pattern in a non-capturing group unless it is self-contained.
	const replacement = isSelfContained ? parsed : wrap(parsed, pattern);
	Object.assign(item, replacement);
};

/**
 * Wraps a parsed tree in a non-capturing group node.
 *
 * @param {object} tree Parsed node that becomes the group's only child.
 * @param {string} pattern Source text of `tree`, used to build `raw`.
 * @returns {object} A `group` node with behavior `ignore`.
 */
const wrap = (tree, pattern) => {
	const group = {};
	group['type'] = 'group';
	group['behavior'] = 'ignore';
	group['body'] = [tree];
	group['raw'] = `(?:${ pattern })`;
	return group;
};

/**
 * Given any codepoint ch, returns false or an array of characters,
 * such that for every c in the array,
 *   c != ch and Canonicalize(~, c) == Canonicalize(~, ch)
 *
 * where Canonicalize is defined in
 * https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch
 *
 * The returned array is always private to the caller: entries stored in
 * `iuMappings` are copied before anything is appended, so the shared mapping
 * data is never modified.
 *
 * @param {number} codePoint input code point
 * @param {number} flags bitwise flags composed of CASE_EQ_FLAG_*
 * @returns false | number[]
 */
const getCaseEquivalents = (codePoint, flags) => {
	if (flags === CASE_EQ_FLAG_NONE) {
		return false;
	}
	const mapped = (flags & CASE_EQ_FLAG_UNICODE) ? iuMappings.get(codePoint) : undefined;
	let result;
	if (!mapped) {
		result = [];
	} else if (typeof mapped === "number") {
		result = [mapped];
	} else {
		// Copy: the ASCII fast path below appends with `push`, which would
		// otherwise grow the array stored inside `iuMappings` on every call.
		result = mapped.slice();
	}
	if (flags & CASE_EQ_FLAG_BMP) {
		// Iterate over a snapshot, because `result` grows inside the loop.
		for (const cp of [codePoint].concat(result)) {
			// Fast path for ASCII characters
			if (cp >= 0x41 && cp <= 0x5a) {
				result.push(cp + 0x20);
			} else if (cp >= 0x61 && cp <= 0x7a) {
				result.push(cp - 0x20);
			} else {
				result = result.concat(iBMPMappings.get(cp) || []);
			}
		}
	}
	return result.length === 0 ? false : result;
};

/**
 * Applies simple case folding to a single code point.
 * https://tc39.es/ecma262/#sec-maybesimplecasefolding
 *
 * ASCII is handled inline (A-Z map to a-z, everything else is unchanged);
 * other code points are looked up in `iuFoldings` and returned unchanged
 * when no folding is recorded.
 *
 * @param {number} codePoint Code point to fold.
 * @returns {number} The folded code point.
 */
const simpleCaseFolding = (codePoint) => {
	const isAscii = codePoint <= 0x7F;
	if (!isAscii) {
		return iuFoldings.get(codePoint) || codePoint;
	}
	// Fast path for ASCII characters
	const isAsciiUppercase = codePoint >= 0x41 && codePoint <= 0x5A;
	return isAsciiUppercase ? codePoint + 0x20 : codePoint;
};

/**
 * Builds the operand handlers for one set operation of a character class.
 *
 * Every handler receives the accumulator `data` (shaped like the result of
 * `getCharacterClassEmptyData()`) and folds one operand into it:
 *   - `single(data, cp)`            a single code point
 *   - `regSet(data, set2)`          a regenerate set
 *   - `range(data, start, end)`     an inclusive code point range
 *   - `iuRange(data, start, end, caseEqFlags)`
 *                                   the case equivalents of a range
 *   - `nested(data, nestedData)`    the data of a nested class or escape
 *
 * `data.first` tells the `intersection` and `subtraction` handlers whether
 * the operand is the first one, in which case it seeds the accumulator
 * instead of being combined with it.
 *
 * @param {string} action One of `union`, `union-negative`, `intersection`
 *   or `subtraction`.
 * @returns {object} The handler object described above.
 * @throws {Error} When `action` is not one of the known operations.
 */
const buildHandler = (action) => {
	switch (action) {
		case 'union':
			return {
				single: (data, cp) => {
					data.singleChars.add(cp);
				},
				regSet: (data, set2) => {
					data.singleChars.add(set2);
				},
				range: (data, start, end) => {
					data.singleChars.addRange(start, end);
				},
				iuRange: (data, start, end, caseEqFlags) => {
					data.singleChars.iuAddRange(start, end, caseEqFlags);
				},
				nested: (data, nestedData) => {
					data.singleChars.add(nestedData.singleChars);
					for (const str of nestedData.longStrings) data.longStrings.add(str);
					if (nestedData.maybeIncludesStrings) data.maybeIncludesStrings = true;
				}
			};
		case 'union-negative': {
			// Each operand is negated: its complement within `UNICODE_SET` is
			// united with what has been accumulated so far.
			const regSet = (data, set2) => {
				data.singleChars = UNICODE_SET.clone().remove(set2).add(data.singleChars);
			};
			return {
				single: (data, cp) => {
					const unicode = UNICODE_SET.clone();
					data.singleChars = data.singleChars.contains(cp) ? unicode : unicode.remove(cp);
				},
				regSet: regSet,
				range: (data, start, end) => {
					data.singleChars = UNICODE_SET.clone().removeRange(start, end).add(data.singleChars);
				},
				iuRange: (data, start, end, caseEqFlags) => {
					data.singleChars = UNICODE_SET.clone().iuRemoveRange(start, end, caseEqFlags).add(data.singleChars);
				},
				nested: (data, nestedData) => {
					regSet(data, nestedData.singleChars);
					if (nestedData.maybeIncludesStrings) throw new Error('ASSERTION ERROR');
				}
			};
		}
		case 'intersection': {
			const regSet = (data, set2) => {
				// Clone the first operand: later operands are intersected in
				// place, which must not modify a set owned by the caller (for
				// example a shared character class escape set).
				if (data.first) data.singleChars = set2.clone();
				else data.singleChars.intersection(set2);
			};
			return {
				// Operands made only of code points cannot match strings, so
				// every accumulated string is dropped.
				single: (data, cp) => {
					data.singleChars = data.first || data.singleChars.contains(cp) ? regenerate(cp) : regenerate();
					data.longStrings.clear();
					data.maybeIncludesStrings = false;
				},
				regSet: (data, set) => {
					regSet(data, set);
					data.longStrings.clear();
					data.maybeIncludesStrings = false;
				},
				range: (data, start, end) => {
					if (data.first) data.singleChars.addRange(start, end);
					else data.singleChars.intersection(regenerate().addRange(start, end));
					data.longStrings.clear();
					data.maybeIncludesStrings = false;
				},
				iuRange: (data, start, end, caseEqFlags) => {
					if (data.first) data.singleChars.iuAddRange(start, end, caseEqFlags);
					else data.singleChars.intersection(regenerate().iuAddRange(start, end, caseEqFlags));
					data.longStrings.clear();
					data.maybeIncludesStrings = false;
				},
				nested: (data, nestedData) => {
					regSet(data, nestedData.singleChars);

					if (data.first) {
						data.longStrings = nestedData.longStrings;
						data.maybeIncludesStrings = nestedData.maybeIncludesStrings;
					} else {
						// Keep only the strings present in both operands.
						for (const str of data.longStrings) {
							if (!nestedData.longStrings.has(str)) data.longStrings.delete(str);
						}
						if (!nestedData.maybeIncludesStrings) data.maybeIncludesStrings = false;
					}
				}
			};
		}
		case 'subtraction': {
			// The first operand seeds the accumulator; later ones are removed.
			const regSet = (data, set2) => {
				if (data.first) data.singleChars.add(set2);
				else data.singleChars.remove(set2);
			};
			return {
				single: (data, cp) => {
					if (data.first) data.singleChars.add(cp);
					else data.singleChars.remove(cp);
				},
				regSet: regSet,
				range: (data, start, end) => {
					if (data.first) data.singleChars.addRange(start, end);
					else data.singleChars.removeRange(start, end);
				},
				iuRange: (data, start, end, caseEqFlags) => {
					if (data.first) data.singleChars.iuAddRange(start, end, caseEqFlags);
					else data.singleChars.iuRemoveRange(start, end, caseEqFlags);
				},
				nested: (data, nestedData) => {
					regSet(data, nestedData.singleChars);

					if (data.first) {
						data.longStrings = nestedData.longStrings;
						data.maybeIncludesStrings = nestedData.maybeIncludesStrings;
					} else {
						for (const str of data.longStrings) {
							if (nestedData.longStrings.has(str)) data.longStrings.delete(str);
						}
					}
				}
			};
		}
		// The `default` clause is only here as a safeguard; it should never be
		// reached. Code coverage tools should ignore it.
		/* node:coverage ignore next */
		default:
			// Report the offending `action`; no character class item is in
			// scope in this function.
			throw new Error(`Unknown set action: ${ action }`);
	}
};

/**
 * Creates the empty accumulator used while computing a character class.
 *
 * @returns {object} Fresh data with:
 *   - `transformed`: seeded from `config.transform.unicodeFlag`
 *   - `singleChars`: an empty regenerate set
 *   - `longStrings`: an empty `Set`
 *   - `hasEmptyString`, `maybeIncludesStrings`: `false`
 *   - `first`: `true`
 */
const getCharacterClassEmptyData = () => {
	const emptyData = {
		transformed: config.transform.unicodeFlag,
		singleChars: regenerate(),
		longStrings: new Set(),
		hasEmptyString: false,
		first: true,
		maybeIncludesStrings: false
	};
	return emptyData;
};

/**
 * Returns `codePoint` followed by its case equivalents, if any.
 *
 * @param {number} codePoint Code point to expand.
 * @param {number} caseEqFlags Bitwise combination of `CASE_EQ_FLAG_*`.
 * @returns {number[]} A new array that always starts with `codePoint`.
 */
const concatCaseEquivalents = (codePoint, caseEqFlags) => {
	const equivalents = getCaseEquivalents(codePoint, caseEqFlags);
	return equivalents ? [codePoint, ...equivalents] : [codePoint];
};

/**
 * Computes character class data for a class-strings item
 * (`classStrings.strings`, each entry holding a `characters` array).
 *
 * Single-character strings are added to `singleChars` together with their
 * case equivalents. Longer (or empty) strings are serialized character by
 * character and stored in `longStrings`:
 *   - with case equivalents requested, every character becomes the
 *     serialized set of the character and its equivalents;
 *   - otherwise a character is re-serialized only when case folding changed
 *     it, and is emitted through `generate` when unchanged.
 *
 * @param {object} classStrings Class-strings AST node.
 * @param {object} regenerateOptions Options passed to regenerate's `toString`.
 * @param {number} caseEqFlags Bitwise combination of `CASE_EQ_FLAG_*`.
 * @param {boolean} shouldApplySCF Whether to apply simple case folding.
 * @returns {object} Data shaped like `getCharacterClassEmptyData()`.
 */
const computeClassStrings = (classStrings, regenerateOptions, caseEqFlags, shouldApplySCF) => {
	const data = getCharacterClassEmptyData();
	const fold = (cp) => (shouldApplySCF ? simpleCaseFolding(cp) : cp);

	const stringifyCharacter = (ch) => {
		const codePoint = fold(ch.codePoint);
		if (caseEqFlags) {
			const equivalents = regenerate(concatCaseEquivalents(codePoint, caseEqFlags));
			return equivalents.toString(regenerateOptions);
		}
		if (codePoint !== ch.codePoint) {
			return regenerate(codePoint).toString(regenerateOptions);
		}
		return generate(ch);
	};

	for (const { characters } of classStrings.strings) {
		if (characters.length === 1) {
			const codePoint = fold(characters[0].codePoint);
			for (const cp of concatCaseEquivalents(codePoint, caseEqFlags)) {
				data.singleChars.add(cp);
			}
			continue;
		}

		let serialized = '';
		for (const ch of characters) {
			serialized += stringifyCharacter(ch);
		}
		data.longStrings.add(serialized);
		data.maybeIncludesStrings = true;
	}

	return data;
};

/**
 * Computes the contents of a character class node without modifying it.
 *
 * Each item of `characterClassItem.body` is folded into an accumulator using
 * the handlers matching the class kind (`union`, `intersection` or
 * `subtraction`). Nested classes are computed recursively; a negated nested
 * class is folded in through the "negative" handler of the current kind.
 *
 * @param {object} characterClassItem Character class AST node.
 * @param {object} regenerateOptions Options passed to regenerate's `toString`.
 * @param {boolean} [shouldApplySCF] Whether simple case folding is applied to
 *   operands; forced to `true` for intersections and subtractions when the
 *   class is in ignore-case mode.
 * @returns {object} Data shaped like `getCharacterClassEmptyData()`; its
 *   `transformed` field tells whether the class has to be regenerated.
 * @throws {SyntaxError} When a negated class may contain strings.
 */
const computeCharacterClass = (characterClassItem, regenerateOptions, shouldApplySCF) => {
	let data = getCharacterClassEmptyData();

	// `handlePositive` folds regular operands; `handleNegative` folds negated
	// nested classes.
	let handlePositive;
	let handleNegative;

	let caseEqFlags = configGetCaseEqFlags();

	switch (characterClassItem.kind) {
		case 'union':
			handlePositive = buildHandler('union');
			handleNegative = buildHandler('union-negative');
			break;
		case 'intersection':
			handlePositive = buildHandler('intersection');
			// Intersecting with a negated class is a subtraction.
			handleNegative = buildHandler('subtraction');
			if (config.transform.unicodeSetsFlag) data.transformed = true;
			if (config.isIgnoreCaseMode) {
				shouldApplySCF = true;
			}
			break;
		case 'subtraction':
			handlePositive = buildHandler('subtraction');
			// Subtracting a negated class is an intersection.
			handleNegative = buildHandler('intersection');
			if (config.transform.unicodeSetsFlag) data.transformed = true;
			if (config.isIgnoreCaseMode) {
				shouldApplySCF = true;
			}
			break;
		// The `default` clause is only here as a safeguard; it should never be
		// reached. Code coverage tools should ignore it.
		/* node:coverage ignore next */
		default:
			throw new Error(`Unknown character class kind: ${ characterClassItem.kind }`);
	}

	for (const item of characterClassItem.body) {
		switch (item.type) {
			case 'value':
				// A single code point, expanded with its case equivalents.
				const codePoint = shouldApplySCF ? simpleCaseFolding(item.codePoint) : item.codePoint;
				const list = concatCaseEquivalents(codePoint, caseEqFlags);
				handlePositive.regSet(data, regenerate(list));
				if (list.length > 1) {
					data.transformed = true;
				}
				break;
			case 'characterClassRange':
				const min = item.min.codePoint;
				const max = item.max.codePoint;
				if (shouldApplySCF) {
					// Fold every code point of the range individually.
					let list = [];
					for (let cp = min; cp <= max; cp++) {
						list.push(simpleCaseFolding(cp));
					}
					handlePositive.regSet(data, regenerate(list));
				} else {
					handlePositive.range(data, min, max);
				}
				if (caseEqFlags) {
					// If shouldApplySCF is true, it is still ok to call iuRange because
					// the set [min, max] shares the same case equivalents with scf([min, max])
					handlePositive.iuRange(data, min, max, caseEqFlags);
					data.transformed = true;
				}
				break;
			case 'characterClassEscape':
				handlePositive.regSet(data, getCharacterClassEscapeSet(
					item.value,
					config.flags.unicode || config.flags.unicodeSets,
					config.flags.ignoreCase,
					shouldApplySCF
				));
				break;
			case 'unicodePropertyEscape':
				const nestedData = getUnicodePropertyEscapeCharacterClassData(
					item.value,
					item.negative,
					config.flags.unicodeSets && config.isIgnoreCaseMode,
					shouldApplySCF
				);
				handlePositive.nested(data, nestedData);
				data.transformed =
					data.transformed ||
					config.transform.unicodePropertyEscapes ||
					(config.transform.unicodeSetsFlag && (nestedData.maybeIncludesStrings || characterClassItem.kind !== "union" || item.negative));
				break;
			case 'characterClass':
				// Nested class: compute it recursively, then fold the result in.
				const handler = item.negative ? handleNegative : handlePositive;
				const res = computeCharacterClass(item, regenerateOptions, shouldApplySCF);
				handler.nested(data, res);
				data.transformed = true;
				break;
			case 'classStrings':
				handlePositive.nested(data, computeClassStrings(item, regenerateOptions, caseEqFlags, shouldApplySCF));
				data.transformed = true;
				break;
			// The `default` clause is only here as a safeguard; it should never be
			// reached. Code coverage tools should ignore it.
			/* node:coverage ignore next */
			default:
				throw new Error(`Unknown term type: ${ item.type }`);
		}

		// Every later operand is combined with the accumulator rather than
		// seeding it.
		data.first = false;
	}

	if (characterClassItem.negative && data.maybeIncludesStrings) {
		throw new SyntaxError('Cannot negate set containing strings');
	}

	return data;
}

/**
 * Rewrites a character class node in place when its computed data says it
 * has to be transformed; otherwise the node is returned untouched.
 *
 * @param {object} characterClassItem Node to process; only its `negative`
 *   field is read here, and the node is overwritten through `update`.
 * @param {object} regenerateOptions Options passed to regenerate's `toString`.
 * @param {object} [computed] Precomputed class data; defaults to the result
 *   of `computeCharacterClass(characterClassItem, regenerateOptions)`.
 * @returns {object} The same `characterClassItem` object.
 */
const processCharacterClass = (
	characterClassItem,
	regenerateOptions,
	computed = computeCharacterClass(characterClassItem, regenerateOptions)
) => {
	const negative = characterClassItem.negative;
	const { singleChars, transformed, longStrings } = computed;
	if (transformed) {
		// If single chars already contains some astral character, regenerate (bmpOnly: true) will create valid regex strings
		const bmpOnly = regenerateContainsAstral(singleChars);
		const setStr = singleChars.toString(Object.assign({}, regenerateOptions, { bmpOnly: bmpOnly }));

		if (negative) {
			if (config.useUnicodeFlag) {
				// The output keeps the `u` flag, so a negated class can be
				// emitted directly; strip the brackets `setStr` may already have.
				update(characterClassItem, `[^${setStr[0] === '[' ? setStr.slice(1, -1) : setStr}]`)
			} else {
				if (config.flags.unicode || config.flags.unicodeSets) {
					if (config.flags.ignoreCase) {
						const astralCharsSet = singleChars.clone().intersection(ASTRAL_SET);
						// Assumption: singleChars do not contain lone surrogates.
						// Regex like /[^\ud800]/u is not supported
						const surrogateOrBMPSetStr = singleChars
							.clone()
							.remove(astralCharsSet)
							.addRange(0xd800, 0xdfff)
							.toString({ bmpOnly: true });
						// Don't generate negative lookahead for astral characters
						// because the case folding is not working anyway as we break
						// code points into surrogate pairs.
						const astralNegativeSetStr = ASTRAL_SET
							.clone()
							.remove(astralCharsSet)
							.toString(regenerateOptions);
						// The transform here does not support lone surrogates.
						update(
							characterClassItem,
							`(?!${surrogateOrBMPSetStr})[^]|${astralNegativeSetStr}`
						);
					} else {
						// Generate negative set directly when case folding is not involved.
						const negativeSet = UNICODE_SET.clone().remove(singleChars);
						update(characterClassItem, negativeSet.toString(regenerateOptions));
					}
				} else {
					// No Unicode flag involved: reject the set with a negative
					// lookahead, then match any single character.
					update(characterClassItem, `(?!${setStr})[^]`);
				}
			}
		} else {
			const hasEmptyString = longStrings.has('');
			// Longest strings first.
			const pieces = Array.from(longStrings).sort((a, b) => b.length - a.length);

			// Insert the single-character set after the strings, but before
			// the empty string when there is one. An empty set (`[]`) is only
			// emitted when there are no strings at all.
			if (setStr !== '[]' || longStrings.size === 0) {
				pieces.splice(pieces.length - (hasEmptyString ? 1 : 0), 0, setStr);
			}

			update(characterClassItem, pieces.join('|'));
		}
	}
	return characterClassItem;
};

/**
 * Throws when named references were seen whose group was never defined.
 *
 * @param {object} groups Bookkeeping object; `unmatchedReferences` maps each
 *   unresolved group name to `true`.
 * @throws {Error} Listing every unresolved group name.
 */
const assertNoUnmatchedReferences = (groups) => {
	const names = Object.keys(groups.unmatchedReferences);
	if (names.length === 0) {
		return;
	}
	throw new Error(`Unknown group names: ${names}`);
};

/**
 * Processes a group that carries inline modifier flags.
 *
 * The enabled/disabled flags are applied to `config.modifiersData` while the
 * group's body is processed, and the previous modifier state is restored
 * afterwards. When modifiers are being transformed, the group is turned into
 * a plain non-capturing group (its `modifierFlags` are removed).
 *
 * @param {object} item Group node with `modifierFlags` and `body`.
 * @param {object} regenerateOptions Forwarded to `processTerm`.
 * @param {object} groups Forwarded to `processTerm`.
 * @returns {object} The same `item`, mutated in place.
 */
const processModifiers = (item, regenerateOptions, groups) => {
	const { enabling, disabling } = item.modifierFlags;

	// Snapshot of the modifier state to restore once the body is processed.
	const previousModifiers = Object.assign({}, config.modifiersData);

	for (const flag of enabling) {
		config.modifiersData[flag] = true;
	}
	for (const flag of disabling) {
		config.modifiersData[flag] = false;
	}

	if (config.transform.modifiers) {
		delete item.modifierFlags;
		item.behavior = 'ignore';
	}

	item.body = item.body.map(term => processTerm(term, regenerateOptions, groups));

	config.modifiersData = previousModifiers;

	return item;
};

/**
 * Recursively processes one AST node, rewriting it according to the current
 * `config`.
 *
 * Most node types are rewritten in place (via `update` or by replacing their
 * `body`) and then returned; named references may instead be replaced by a
 * newly created node, so callers must use the return value.
 *
 * @param {object} item AST node to process.
 * @param {object} regenerateOptions Options passed to regenerate's `toString`.
 * @param {object} groups Bookkeeping for capture groups: `lastIndex`,
 *   `names`, `namesConflicts`, `unmatchedReferences` and the optional
 *   `onNamedGroup` callback. It is mutated while processing.
 * @returns {object} The processed node.
 * @throws {Error} On duplicate group names in the same context, on
 *   properties of strings used without the `v` flag, and on unknown node
 *   types.
 */
const processTerm = (item, regenerateOptions, groups) => {
	switch (item.type) {
		case 'dot':
			if (config.transform.unicodeFlag) {
				update(
					item,
					getUnicodeDotSet(config.isDotAllMode).toString(regenerateOptions)
				);
			} else if ((config.modifiersData.s != null ? config.modifiersData.s && config.transform.modifiers : config.transform.dotAllFlag)) {
				// TODO: consider changing this at the regenerate level.
				update(item, '[^]');
			}
			break;
		case 'characterClass':
			item = processCharacterClass(item, regenerateOptions);
			break;
		case 'unicodePropertyEscape':
			const data = getUnicodePropertyEscapeCharacterClassData(item.value, item.negative, config.flags.unicodeSets && config.isIgnoreCaseMode);
			if (data.maybeIncludesStrings) {
				if (!config.flags.unicodeSets) {
					throw new Error(
						'Properties of strings are only supported when using the unicodeSets (v) flag.'
					);
				}
				if (config.transform.unicodeSetsFlag) {
					// Reuse the character class machinery with the data that
					// was already computed for this escape.
					data.transformed = true;
					item = processCharacterClass(item, regenerateOptions, data);
				}
			} else if (config.transform.unicodePropertyEscapes || configGetCaseEqFlags()) {
				update(
					item,
					data.singleChars.toString(regenerateOptions)
				);
			}
			break;
		case 'characterClassEscape':
			if (config.transform.unicodeFlag) {
				update(
					item,
					getCharacterClassEscapeSet(
						item.value,
						/* config.transform.unicodeFlag implies config.flags.unicode */ true,
						config.flags.ignoreCase
					).toString(regenerateOptions)
				);
			}
			break;
		case 'group':
			// Only groups with behavior `normal` advance the group index.
			if (item.behavior == 'normal') {
				groups.lastIndex++;
			}
			if (item.name) {
				const name = item.name.value;

				if (groups.namesConflicts[name]) {
					throw new Error(
						`Group '${ name }' has already been defined in this context.`
					);
				}
				groups.namesConflicts[name] = true;

				if (config.transform.namedGroups) {
					delete item.name;
				}

				// Record which index this name maps to; a name may map to
				// several indexes.
				const index = groups.lastIndex;
				if (!groups.names[name]) {
					groups.names[name] = [];
				}
				groups.names[name].push(index);

				if (groups.onNamedGroup) {
					groups.onNamedGroup.call(null, name, index);
				}

				// A reference seen earlier is now resolved.
				if (groups.unmatchedReferences[name]) {
					delete groups.unmatchedReferences[name];
				}
			}
			if (item.modifierFlags) {
				return processModifiers(item, regenerateOptions, groups);
			}
			/* falls through */
		case 'quantifier':
			item.body = item.body.map(term => {
				return processTerm(term, regenerateOptions, groups);
			});
			break;
		case 'disjunction':
			// Every alternative gets its own name-conflict scope inheriting
			// from the outer one, so the same name may be reused across
			// alternatives.
			const outerNamesConflicts = groups.namesConflicts;
			item.body = item.body.map(term => {
				groups.namesConflicts = Object.create(outerNamesConflicts);
				return processTerm(term, regenerateOptions, groups);
			});
			break;
		case 'alternative':
			item.body = flatMap(item.body, term => {
				const res = processTerm(term, regenerateOptions, groups);
				// Alternatives cannot contain alternatives; flatten them.
				return res.type === 'alternative' ? res.body : res;
			});
			break;
		case 'value':
			const codePoint = item.codePoint;
			const caseEqFlags = configGetCaseEqFlags();
			const list = concatCaseEquivalents(codePoint, caseEqFlags);
			if (list.length === 1 && item.kind === "symbol" && codePoint >= 0x20 && codePoint <= 0x7E) {
				// skip regenerate when it is a printable ASCII symbol
				break;
			}
			const set = regenerate(list);
			update(item, set.toString(regenerateOptions));
			break;
		case 'reference':
			if (item.name) {
				const name = item.name.value;
				const indexes = groups.names[name];
				if (!indexes) {
					groups.unmatchedReferences[name] = true;
				}

				if (config.transform.namedGroups) {
					if (indexes) {
						// Replace the named reference by numeric references to
						// every group that carries this name.
						const body = indexes.map(index => ({
							'type': 'reference',
							'matchIndex': index,
							'raw': '\\' + index,
						}));
						if (body.length === 1) {
							return body[0];
						}
						return {
							'type': 'alternative',
							'body': body,
							'raw': body.map(term => term.raw).join(''),
						};
					}

					// This named reference comes before the group where it’s defined,
					// so it’s always an empty match.
					return {
						'type': 'group',
						'behavior': 'ignore',
						'body': [],
						'raw': '(?:)',
					};
				}
			}
			break;
		case 'anchor':
			// Emulate an inline `m` modifier when modifiers are transformed.
			if (config.modifiersData.m && config.transform.modifiers) {
				if (item.kind == 'start') {
					update(item, `(?:^|(?<=${NEWLINE_SET.toString()}))`);
				} else if (item.kind == 'end') {
					update(item, `(?:$|(?=${NEWLINE_SET.toString()}))`);
				}
			}
			/* falls through */
		case 'empty':
			// Nothing to do here.
			break;
		// The `default` clause is only here as a safeguard; it should never be
		// reached. Code coverage tools should ignore it.
		/* node:coverage ignore next */
		default:
			throw new Error(`Unknown term type: ${ item.type }`);
	}
	return item;
};

// Module-level state describing the pattern currently being rewritten.
//   - `flags`: which RegExp flags the input pattern uses.
//   - `transform`: which features have to be compiled away.
//   - `modifiersData`: state of the inline modifiers `i`, `s` and `m`;
//     `undefined` means "no inline information, use the flag".
const config = {
	flags: {
		ignoreCase: false,
		unicode: false,
		unicodeSets: false,
		dotAll: false,
		multiline: false,
	},
	transform: {
		dotAllFlag: false,
		unicodeFlag: false,
		unicodeSetsFlag: false,
		unicodePropertyEscapes: false,
		namedGroups: false,
		modifiers: false,
	},
	modifiersData: {
		i: undefined,
		s: undefined,
		m: undefined,
	},
	// Whether the generated pattern still relies on the `u` flag: the input
	// uses `u` or `v` and the unicode flag is not being transformed.
	get useUnicodeFlag() {
		const isUnicodeMode = this.flags.unicode || this.flags.unicodeSets;
		return isUnicodeMode && !this.transform.unicodeFlag;
	},
	// Dot-all state: the inline `s` modifier wins over the `s` flag.
	get isDotAllMode() {
		const inline = this.modifiersData.s;
		return inline === undefined ? this.flags.dotAll : inline;
	},
	// Ignore-case state: the inline `i` modifier wins over the `i` flag.
	get isIgnoreCaseMode() {
		const inline = this.modifiersData.i;
		return inline === undefined ? this.flags.ignoreCase : inline;
	}
};

/**
 * Validates the options object passed to `rewritePattern`.
 *
 * Accepted keys:
 *   - `dotAllFlag`, `unicodeFlag`, `unicodePropertyEscapes`,
 *     `unicodeSetsFlag`, `namedGroups`: nullish, `false` or `'transform'`
 *   - `modifiers`: nullish, `false`, `'parse'` or `'transform'`
 *   - `onNamedGroup`, `onNewFlags`: nullish or a function
 *
 * @param {object} [options] Options to validate; a falsy value is accepted.
 * @throws {Error} On an unknown key or an invalid value.
 */
const validateOptions = (options) => {
	if (!options) {
		return;
	}

	const transformOnlyKeys = [
		'dotAllFlag',
		'unicodeFlag',
		'unicodePropertyEscapes',
		'unicodeSetsFlag',
		'namedGroups',
	];
	const callbackKeys = ['onNamedGroup', 'onNewFlags'];

	Object.keys(options).forEach((key) => {
		const value = options[key];
		const isDisabled = value == null || value === false;

		if (transformOnlyKeys.includes(key)) {
			if (!isDisabled && value !== 'transform') {
				throw new Error(`.${key} must be false (default) or 'transform'.`);
			}
		} else if (key === 'modifiers') {
			// todo: remove modifiers: 'parse' in regexpu-core v7
			if (!isDisabled && value !== 'parse' && value !== 'transform') {
				throw new Error(`.${key} must be false (default), 'parse' or 'transform'.`);
			}
		} else if (callbackKeys.includes(key)) {
			if (value != null && typeof value !== 'function') {
				throw new Error(`.${key} must be a function.`);
			}
		} else {
			throw new Error(`.${key} is not a valid regexpu-core option.`);
		}
	});
};

// Whether `flags` contains `flag`; a missing or empty `flags` yields `false`.
const hasFlag = (flags, flag) => {
	if (!flags) {
		return false;
	}
	return flags.includes(flag);
};
// Whether option `name` is set to 'transform'; missing options yield `false`.
const transform = (options, name) => {
	if (!options) {
		return false;
	}
	return options[name] === 'transform';
};

/**
 * Walks a parsed pattern and collects every modifier flag that is disabled
 * by an inline modifier group (the `disabling` part of `modifierFlags`).
 *
 * @param {object} tree Root of the parsed pattern.
 * @returns {object} Prototype-less object mapping disabled flags to `true`.
 */
const collectDisabledModifiers = (tree) => {
	const disabled = Object.create(null);
	const pending = [tree];
	// Iterative depth-first walk over all nested objects and arrays. The
	// loop runs until the stack is empty, so a nullish entry inside an array
	// is skipped instead of ending the walk early.
	while (pending.length > 0) {
		const node = pending.pop();
		if (Array.isArray(node)) {
			Array.prototype.push.apply(pending, node);
		} else if (typeof node === 'object' && node !== null) {
			for (const key of Object.keys(node)) {
				const value = node[key];
				if (key === 'modifierFlags') {
					for (const flag of value.disabling) {
						disabled[flag] = true;
					}
				} else if (typeof value === 'object' && value !== null) {
					pending.push(value);
				}
			}
		}
	}
	return disabled;
};

/**
 * Rewrites a regular expression pattern according to `options`.
 *
 * The module-level `config` object is reset from `flags` and `options` on
 * every call, the pattern is parsed, processed by `processTerm` and
 * serialized again.
 *
 * @param {string} pattern Source of the regular expression.
 * @param {string} [flags] RegExp flags of the pattern; may be omitted.
 * @param {object} [options] See `validateOptions` for the accepted keys.
 *   `onNamedGroup(name, index)` is called for every named group and
 *   `onNewFlags(flags)` receives the flags the rewritten pattern needs.
 * @returns {string} The rewritten pattern.
 * @throws {Error} On invalid options, duplicate group names or references
 *   to unknown group names.
 */
const rewritePattern = (pattern, flags, options) => {
	validateOptions(options);

	config.flags.unicode = hasFlag(flags, 'u');
	config.flags.unicodeSets = hasFlag(flags, 'v');
	config.flags.ignoreCase = hasFlag(flags, 'i');
	config.flags.dotAll = hasFlag(flags, 's');
	config.flags.multiline = hasFlag(flags, 'm');

	config.transform.dotAllFlag = config.flags.dotAll && transform(options, 'dotAllFlag');
	config.transform.unicodeFlag = (config.flags.unicode || config.flags.unicodeSets) && transform(options, 'unicodeFlag');
	config.transform.unicodeSetsFlag = config.flags.unicodeSets && transform(options, 'unicodeSetsFlag');

	// unicodeFlag: 'transform' implies unicodePropertyEscapes: 'transform'
	config.transform.unicodePropertyEscapes = (config.flags.unicode || config.flags.unicodeSets) && (
		transform(options, 'unicodeFlag') || transform(options, 'unicodePropertyEscapes')
	);
	config.transform.namedGroups = transform(options, 'namedGroups');
	config.transform.modifiers = transform(options, 'modifiers');

	config.modifiersData.i = undefined;
	config.modifiersData.s = undefined;
	config.modifiersData.m = undefined;

	const regjsparserFeatures = {
		// Enable every stable RegExp feature by default
		'modifiers': true,
		'unicodePropertyEscape': true,
		'unicodeSet': true,
		'namedGroups': true,
		'lookbehind': true,
	};

	const regenerateOptions = {
		'hasUnicodeFlag': config.useUnicodeFlag,
		'bmpOnly': !config.flags.unicode && !config.flags.unicodeSets
	};

	const groups = {
		'onNamedGroup': options && options.onNamedGroup,
		'lastIndex': 0,
		'names': Object.create(null), // { [name]: Array }
		'namesConflicts': Object.create(null), // { [name]: true }
		'unmatchedReferences': Object.create(null) // { [name]: true }
	};

	const tree = parse(pattern, flags, regjsparserFeatures);

	if (config.transform.modifiers) {
		if (/\(\?[a-z]*-[a-z]+:/.test(pattern)) {
			// the pattern _likely_ contain inline disabled modifiers
			// we need to traverse to make sure that they are actually modifiers and to collect them
			const allDisabledModifiers = collectDisabledModifiers(tree);
			// A flag that is disabled somewhere has to be emulated everywhere:
			// seed the modifier state with the value of the global flag.
			if (allDisabledModifiers.i) {
				config.modifiersData.i = config.flags.ignoreCase;
			}
			if (allDisabledModifiers.m) {
				config.modifiersData.m = config.flags.multiline;
			}
			if (allDisabledModifiers.s) {
				config.modifiersData.s = config.flags.dotAll;
			}
		}
	}

	// Note: `processTerm` mutates `tree` and `groups`.
	processTerm(tree, regenerateOptions, groups);
	assertNoUnmatchedReferences(groups);

	const onNewFlags = options && options.onNewFlags;
	if (onNewFlags) {
		// `flags` may be omitted; treat that like an empty flags string.
		// Flags whose modifier state is truthy are emulated in the pattern
		// and therefore dropped.
		let newFlags = (flags || '').split('').filter((flag) => !config.modifiersData[flag]).join('');
		if (config.transform.unicodeSetsFlag) {
			newFlags = newFlags.replace('v', 'u');
		}
		if (config.transform.unicodeFlag) {
			newFlags = newFlags.replace('u', '');
		}
		if (config.transform.dotAllFlag) {
			newFlags = newFlags.replace('s', '');
		}
		onNewFlags(newFlags);
	}

	return generate(tree);
};

module.exports = rewritePattern;




© 2015 - 2024 Weber Informatics LLC | Privacy Policy