// io.jenetics.ext.grammar.Cfg (Maven / Gradle / Ivy)
// The newest version!
/*
* Java Genetic Algorithm Library (jenetics-8.1.0).
* Copyright (c) 2007-2024 Franz Wilhelmstötter
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Author:
* Franz Wilhelmstötter ([email protected])
*/
package io.jenetics.ext.grammar;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
import static java.util.stream.Collectors.groupingBy;
import static java.util.stream.Collectors.toCollection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Represents a <em>context-free</em> grammar
 * (<a href="https://en.wikipedia.org/wiki/Context-free_grammar"><b>CFG</b></a>).
 * <p>
 * <b>Formal definition</b>
 * <p>
 * A context-free grammar {@code G} is defined by the 4-tuple
 * {@code G = (N, T, R, S)}, where
 * <ul>
 *     <li>{@code N} is a finite set; each element {@code n ∈ N} is called a
 *     non-terminal ({@link NonTerminal}) character or a variable. Each
 *     variable represents a different type of phrase or clause in the sentence.
 *     Variables are also sometimes called syntactic categories. Each variable
 *     defines a sub-language of the language defined by {@code G}.
 *     </li>
 *     <li>{@code T} is a finite set of terminals ({@link Terminal}) disjoint
 *     from {@code N}, which make up the actual content of the sentence. The set
 *     of terminals is the alphabet of the language defined by the grammar
 *     {@code G}.
 *     </li>
 *     <li>{@code R} is a finite relation in {@code N × (N ∪ T)∗}, where the
 *     asterisk represents the
 *     <a href="https://en.wikipedia.org/wiki/Kleene_star">Kleene star</a>
 *     operation. The members of {@code R} are called the (rewrite) rules
 *     ({@link Rule}) or productions of the grammar.
 *     </li>
 *     <li>{@code S} is the start variable (or start symbol), used to represent
 *     the whole sentence (or program). It must be an element of {@code N}
 *     ({@link NonTerminal}).
 *     </li>
 * </ul>
 *
 * You can easily create a <em>Cfg</em> object from a given BNF grammar.
 * {@snippet lang="java":
 * final Cfg<String> grammar = Bnf.parse("""
 *     <expr> ::= <num> | <var> | '(' <expr> <op> <expr> ')'
 *     <op>   ::= + | - | * | /
 *     <var>  ::= x | y
 *     <num>  ::= 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
 *     """
 * );
 * }
 *
 * It is also possible to create the grammar above programmatically, using the
 * static factory methods {@link #R}, {@link #E}, {@link #N} and {@link #T}.
 * {@snippet lang="java":
 * final Cfg<String> grammar = Cfg.of(
 *     R("expr",
 *         E(N("num")),
 *         E(N("var")),
 *         E(T("("), N("expr"), N("op"), N("expr"), T(")"))
 *     ),
 *     R("op", E(T("+")), E(T("-")), E(T("*")), E(T("/"))),
 *     R("var", E(T("x")), E(T("y"))),
 *     R("num",
 *         E(T("0")), E(T("1")), E(T("2")), E(T("3")),
 *         E(T("4")), E(T("5")), E(T("6")), E(T("7")),
 *         E(T("8")), E(T("9"))
 *     )
 * );
 * }
 *
 * @see Bnf#parse(String)
 *
 * @param <T> the terminal symbol value type
 * @param nonTerminals the non-terminal symbols of the grammar
 * @param terminals the terminal symbols of the grammar
 * @param rules the production rules of the grammar
 * @param start the start symbol of the grammar
 *
 * @author Franz Wilhelmstötter
 * @since 7.1
 * @version 7.1
 */
public record Cfg<T>(
    List<NonTerminal<T>> nonTerminals,
    List<Terminal<T>> terminals,
    List<Rule<T>> rules,
    NonTerminal<T> start
) {

    /**
     * Represents the symbols the BNF grammar consists of.
     *
     * @param <T> the terminal symbol value type
     */
    public sealed interface Symbol<T> {

        /**
         * Return the name of the symbol.
         *
         * @return the name of the symbol
         */
        String name();

    }

    /**
     * Represents the non-terminal symbols of the grammar ({@code NT}).
     *
     * @param <T> the terminal symbol value type
     * @param name the name of the non-terminal symbol
     */
    public record NonTerminal<T>(String name) implements Symbol<T> {

        /**
         * @param name the name of the non-terminal symbol
         * @throws IllegalArgumentException if the given {@code name} is not
         *         a valid <em>non-terminal</em> name (i.e. it is empty)
         * @throws NullPointerException if one of the arguments is {@code null}
         */
        public NonTerminal {
            // A null name fails here with a NullPointerException.
            if (name.isEmpty()) {
                throw new IllegalArgumentException(
                    "Non-terminal value must not be empty."
                );
            }
        }
    }

    /**
     * Represents the terminal symbols of the grammar ({@code T}).
     *
     * @param <T> the terminal symbol value type
     * @param name the name of the terminal symbol
     * @param value the value of the terminal symbol
     */
    public record Terminal<T>(String name, T value) implements Symbol<T> {

        /**
         * @param name the name of the terminal symbol
         * @param value the value of the terminal symbol
         * @throws IllegalArgumentException if the given terminal {@code name}
         *         is empty
         * @throws NullPointerException if the given {@code name} is
         *         {@code null}
         */
        public Terminal {
            // Only the name is validated; the value is stored as given.
            if (name.isEmpty()) {
                throw new IllegalArgumentException(
                    "Terminal value must not be empty."
                );
            }
        }

        /**
         * Return a new terminal symbol where the name of the symbol is equal
         * to its value.
         *
         * @param name the name (and value) of the terminal symbol
         * @return a new terminal symbol with the given {@code name}
         * @throws IllegalArgumentException if the given terminal {@code name}
         *         is empty
         */
        public static Terminal<String> of(final String name) {
            return new Terminal<>(name, name);
        }

    }

    /**
     * Represents one <em>expression</em> (list of alternative symbols) a
     * production rule consists of.
     *
     * @param <T> the terminal symbol value type
     * @param symbols the list of symbols of the expression
     */
    public record Expression<T>(List<Symbol<T>> symbols) {

        /**
         * @param symbols the list of symbols of the expression
         * @throws IllegalArgumentException if the list of {@code symbols} is
         *         empty
         * @throws NullPointerException if the list, or one of its elements,
         *         is {@code null}
         */
        public Expression {
            if (symbols.isEmpty()) {
                throw new IllegalArgumentException(
                    "The list of symbols must not be empty."
                );
            }
            // Defensive, unmodifiable copy; rejects null elements.
            symbols = List.copyOf(symbols);
        }
    }

    /**
     * Represents a production rule of the grammar ({@code R}).
     *
     * @param <T> the terminal symbol value type
     * @param start the start symbol of the rule
     * @param alternatives the list of alternative rule expressions
     */
    public record Rule<T>(NonTerminal<T> start, List<Expression<T>> alternatives) {

        /**
         * Creates a new rule object.
         *
         * @param start the start symbol of the rule
         * @param alternatives the list of alternative rule expressions
         * @throws IllegalArgumentException if the given list of
         *         {@code alternatives} is empty
         * @throws NullPointerException if one of the arguments is {@code null}
         */
        public Rule {
            requireNonNull(start);
            if (alternatives.isEmpty()) {
                throw new IllegalArgumentException(
                    "Rule alternatives must not be empty."
                );
            }
            // Defensive, unmodifiable copy; rejects null elements.
            alternatives = List.copyOf(alternatives);
        }
    }

    /**
     * Create a new <em>context-free</em> grammar object.
     *
     * @param nonTerminals the non-terminal symbols of {@code this} grammar
     * @param terminals the terminal symbols of {@code this} grammar
     * @param rules the <em>production</em> rules of {@code this} grammar
     * @param start the <em>start</em> symbol of {@code this} grammar
     * @throws NullPointerException if one of the arguments is {@code null}
     * @throws IllegalArgumentException if the list of rules is empty, a rule
     *         is defined more than once, the start symbol points to a missing
     *         rule, the rules use symbols not defined in the list of
     *         {@link #nonTerminals()} or {@link #terminals()}, or a terminal
     *         and a non-terminal share the same name
     */
    public Cfg {
        if (rules.isEmpty()) {
            throw new IllegalArgumentException(
                "The given list of rules must not be empty."
            );
        }

        // Check the uniqueness of the rules: every start symbol may appear
        // as the left-hand side of at most one rule.
        final var duplicatedRules = rules.stream()
            .collect(Collectors.groupingBy(Rule::start))
            .values().stream()
            .filter(values -> values.size() > 1)
            .map(rule -> rule.get(0).start.name)
            .toList();

        if (!duplicatedRules.isEmpty()) {
            throw new IllegalArgumentException(
                "Found duplicate rule(s), " + duplicatedRules + "."
            );
        }

        // Check if start symbol points to an existing rule.
        final var startRule = rules.stream()
            .filter(r -> start.equals(r.start))
            .findFirst();

        if (startRule.isEmpty()) {
            throw new IllegalArgumentException(
                "No rule found for start symbol %s.".formatted(start)
            );
        }

        // Check that all symbols used in the given rules are also defined
        // in the list of non-terminals and terminals.
        final Set<Symbol<T>> required = rules.stream()
            .flatMap(Cfg::ruleSymbols)
            .collect(Collectors.toUnmodifiableSet());

        final Set<Symbol<T>> available = Stream
            .concat(nonTerminals.stream(), terminals.stream())
            .collect(Collectors.toUnmodifiableSet());

        final var missing = new HashSet<>(required);
        missing.removeAll(available);

        if (!missing.isEmpty()) {
            throw new IllegalArgumentException(
                "Unknown symbols defined in rules: " + missing
            );
        }

        // Check if the name of terminals and non-terminals are distinct.
        final var terminalNames = terminals.stream()
            .map(Symbol::name)
            .collect(Collectors.toSet());

        final var nonTerminalNames = nonTerminals.stream()
            .map(Symbol::name)
            .collect(Collectors.toSet());

        // After this call only the names used by both kinds remain.
        terminalNames.retainAll(nonTerminalNames);
        if (!terminalNames.isEmpty()) {
            throw new IllegalArgumentException(format(
                "Terminal and non-terminal symbols with same name: %s",
                terminalNames.stream().sorted().toList()
            ));
        }

        // Store unmodifiable snapshots of the given lists.
        nonTerminals = List.copyOf(nonTerminals);
        terminals = List.copyOf(terminals);
        rules = List.copyOf(rules);
        requireNonNull(start);
    }

    /**
     * Return the rule for the given {@code start} symbol. The rule is looked
     * up by the <em>name</em> of the given symbol.
     *
     * @param start the start symbol of the rule
     * @return the rule for the given {@code start} symbol, or an empty
     *         optional if no such rule exists
     * @throws NullPointerException if the given {@code start} symbol is
     *         {@code null}
     */
    public Optional<Rule<T>> rule(final NonTerminal<?> start) {
        requireNonNull(start);
        for (var rule : rules) {
            if (rule.start().name().equals(start.name())) {
                return Optional.of(rule);
            }
        }
        return Optional.empty();
    }

    /**
     * Maps the values of the terminal symbols from type {@code T} to type
     * {@code A}.
     *
     * @param mapper the mapper function
     * @param <A> the new value type of the terminal symbols
     * @return the mapped grammar
     * @throws NullPointerException if the given mapper is {@code null}
     */
    public <A> Cfg<A> map(final Function<? super Terminal<T>, ? extends A> mapper) {
        requireNonNull(mapper);

        // Equal terminals are mapped only once and share the mapped instance.
        final var cache = new HashMap<Terminal<T>, Terminal<A>>();
        final Function<Terminal<T>, Terminal<A>> mapping = t -> cache
            .computeIfAbsent(t, t2 -> new Terminal<>(t2.name(), mapper.apply(t2)));

        // Non-terminals carry no value of type T, so re-typing them to
        // the new terminal value type A is safe.
        @SuppressWarnings("unchecked")
        final List<Rule<A>> rules = rules().stream()
            .map(rule -> new Rule<>(
                (NonTerminal<A>)rule.start(),
                rule.alternatives().stream()
                    .map(expr -> new Expression<>(
                        expr.symbols().stream()
                            .map(sym -> sym instanceof Cfg.Terminal<T> t
                                ? mapping.apply(t) : (Symbol<A>)sym)
                            .toList()
                    ))
                    .toList()
            ))
            .toList();

        return Cfg.of(rules);
    }

    /**
     * Create a grammar object with the given rules. Duplicated rules are merged
     * into one rule. The <em>start</em> symbol of the first rule is chosen as
     * the start symbol of the created CFG.
     *
     * @param rules the rules the grammar consists of
     * @param <T> the terminal symbol value type
     * @return a new grammar built from the given rules
     * @throws IllegalArgumentException if the list of rules is empty
     * @throws NullPointerException if the list of rules is {@code null}
     */
    public static <T> Cfg<T> of(final List<Rule<T>> rules) {
        if (rules.isEmpty()) {
            throw new IllegalArgumentException(
                "The list of rules must not be empty."
            );
        }

        final List<Rule<T>> normalizedRules = normalize(rules);

        // All distinct symbols, in order of first appearance.
        final List<Symbol<T>> symbols = normalizedRules.stream()
            .flatMap(Cfg::ruleSymbols)
            .distinct()
            .toList();

        final List<NonTerminal<T>> nonTerminals = symbols.stream()
            .filter(NonTerminal.class::isInstance)
            .map(nt -> (NonTerminal<T>)nt)
            .toList();

        final List<Terminal<T>> terminals = symbols.stream()
            .filter(Terminal.class::isInstance)
            .map(nt -> (Terminal<T>)nt)
            .toList();

        // Rebuild the rules so that symbols with the same name refer to the
        // same instance taken from the 'symbols' list.
        return new Cfg<>(
            nonTerminals,
            terminals,
            normalizedRules.stream()
                .map(r -> rebuild(r, symbols))
                .toList(),
            (NonTerminal<T>)select(normalizedRules.get(0).start(), symbols)
        );
    }

    /**
     * Create a grammar object with the given rules. Duplicated rules are merged
     * into one rule. The <em>start</em> symbol of the first rule is chosen as
     * the start symbol of the created CFG.
     *
     * @param rules the rules the grammar consists of
     * @param <T> the terminal symbol value type
     * @return a new grammar built from the given rules
     * @throws IllegalArgumentException if the list of rules is empty
     * @throws NullPointerException if the list of rules is {@code null}
     */
    @SafeVarargs
    public static <T> Cfg<T> of(final Rule<T>... rules) {
        return Cfg.of(List.of(rules));
    }

    // Groups the rules by start symbol (keeping first-appearance order) and
    // merges each group into a single rule.
    private static <T> List<Rule<T>> normalize(final List<Rule<T>> rules) {
        final Map<NonTerminal<T>, List<Rule<T>>> grouped = rules.stream()
            .collect(groupingBy(
                Rule::start,
                LinkedHashMap::new,
                toCollection(ArrayList::new)));

        return grouped.entrySet().stream()
            .map(entry -> merge(entry.getKey(), entry.getValue()))
            .toList();
    }

    // Creates one rule for 'start' containing the alternatives of all the
    // given rules, in the given order.
    private static <T> Rule<T> merge(final NonTerminal<T> start, final List<Rule<T>> rules) {
        return new Rule<>(
            start,
            rules.stream()
                .flatMap(rule -> rule.alternatives().stream())
                .toList()
        );
    }

    // Streams the start symbol of the rule followed by all symbols of all of
    // its alternatives.
    private static <T> Stream<Symbol<T>> ruleSymbols(final Rule<T> rule) {
        return Stream.concat(
            Stream.of(rule.start),
            rule.alternatives.stream()
                .flatMap(expr -> expr.symbols().stream())
        );
    }

    // Recreates the rule using the symbol instances from 'symbols'.
    private static <T> Rule<T> rebuild(final Rule<T> rule, final List<Symbol<T>> symbols) {
        return new Rule<>(
            (NonTerminal<T>)select(rule.start, symbols),
            rule.alternatives.stream()
                .map(e -> rebuild(e, symbols))
                .toList()
        );
    }

    // Recreates the expression using the symbol instances from 'symbols'.
    private static <T> Expression<T>
    rebuild(final Expression<T> expression, final List<Symbol<T>> symbols) {
        return new Expression<>(
            expression.symbols.stream()
                .map(s -> select(s, symbols))
                .toList()
        );
    }

    // Returns the first symbol of 'symbols' with the same name as 'symbol'.
    private static <T> Symbol<T> select(
        final Symbol<T> symbol,
        final List<Symbol<T>> symbols
    ) {
        for (var s : symbols) {
            if (s.name().equals(symbol.name())) {
                return s;
            }
        }
        throw new AssertionError("Symbol not found: " + symbol);
    }

    // Re-types a grammar to a super type of its terminal value type.
    @SuppressWarnings("unchecked")
    static <A, B extends A> Cfg<A> upcast(final Cfg<B> seq) {
        return (Cfg<A>)seq;
    }

    /* *************************************************************************
     * Static factory methods for rule creation.
     * ************************************************************************/

    /**
     * Factory method for creating a terminal symbol with the given
     * {@code name} and {@code value}.
     *
     * @param name the name of the terminal symbol
     * @param value the value of the terminal symbol
     * @param <T> the terminal symbol value type
     * @return a new terminal symbol
     * @throws IllegalArgumentException if the given {@code name} is empty
     */
    public static <T> Terminal<T> T(final String name, final T value) {
        return new Terminal<>(name, value);
    }

    /**
     * Factory method for creating a terminal symbol with the given
     * {@code name}. The name is also used as the value of the symbol.
     *
     * @param name the name of the terminal symbol
     * @return a new terminal symbol
     * @throws IllegalArgumentException if the given {@code name} is empty
     */
    public static Terminal<String> T(final String name) {
        return new Terminal<>(name, name);
    }

    /**
     * Factory method for creating non-terminal symbols.
     *
     * @param name the name of the symbol.
     * @param <T> the terminal symbol value type
     * @return a new non-terminal symbol
     * @throws IllegalArgumentException if the given {@code name} is empty
     */
    public static <T> NonTerminal<T> N(final String name) {
        return new NonTerminal<>(name);
    }

    /**
     * Factory method for creating an expression with the given
     * {@code symbols}.
     *
     * @param symbols the list of symbols of the expression
     * @throws IllegalArgumentException if the list of {@code symbols} is
     *         empty
     * @param <T> the terminal symbol value type
     * @return a new expression
     */
    @SafeVarargs
    public static <T> Expression<T> E(final Symbol<T>... symbols) {
        return new Expression<>(List.of(symbols));
    }

    /**
     * Factory method for creating a new rule.
     *
     * @param name the name of start symbol of the rule
     * @param alternatives the list of alternative rule expressions
     * @throws IllegalArgumentException if the given list of
     *         {@code alternatives} is empty
     * @throws NullPointerException if one of the arguments is {@code null}
     * @param <T> the terminal symbol value type
     * @return a new rule
     */
    @SafeVarargs
    public static <T> Rule<T> R(
        final String name,
        final Expression<T>... alternatives
    ) {
        return new Rule<>(new NonTerminal<>(name), List.of(alternatives));
    }

}