All downloads are free. Search and download functionality uses the official Maven repository.

prettify.parser.Prettify Maven / Gradle / Ivy

Go to download

Synchronous or asynchronous syntax highlighter for RichTextFX StyleClassedTextArea

The newest version!
// Copyright (C) 2006 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package prettify.parser;

import prettify.lang.*;

import java.lang.reflect.Method;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

/**
 * This is similar to the syntaxhighlighterfx.js in JavaScript Prettify.
 *
 * All comments are adapted from the JavaScript Prettify.
 *
 * <p>
 * Some functions for browser-side pretty printing of code contained in html.
 * </p>
 *
 * <p>
 * For a fairly comprehensive set of languages see the README file that came
 * with this source.  At a minimum, the lexer should work on a number of
 * languages including C and friends, Java, Python, Bash, SQL, HTML, XML, CSS,
 * Javascript, and Makefiles.  It works passably on Ruby, PHP and Awk and a
 * subset of Perl, but, because of commenting conventions, doesn't work on
 * Smalltalk, Lisp-like, or CAML-like languages without an explicit lang class.
 * </p>
 *
 * <p>
 * Usage:
 * <ol>
 *   <li>include this source file in an html page via a {@code <script>} element</li>
 *   <li>define style rules.  See the example page for examples.</li>
 *   <li>mark the {@code <pre>} and {@code <code>} tags in your source with
 *     {@code class=prettyprint.}
 *     You can also use the (html deprecated) {@code <xmp>} tag, but the pretty
 *     printer needs to do more substantial DOM manipulations to support that, so
 *     some css styles may not be preserved.</li>
 * </ol>
 * That's it.  I wanted to keep the API as simple as possible, so there's no
 * need to specify which language the code is in, but if you wish, you can add
 * another class to the {@code <pre>} or {@code <code>} element to specify the
 * language, as in {@code <pre class="prettyprint lang-java">}.  Any class that
 * starts with "lang-" followed by a file extension, specifies the file type.
 * See the "lang-*.js" files in this directory for code that implements
 * per-language file handlers.
 * <p>
 * Change log:<br>
 * cbeust, 2006/08/22
 * <blockquote>
 *   Java annotations (start with "@") are now captured as literals ("lit")
 * </blockquote>
 */
    public class Prettify {
    
      /** Logger named after this class; the constructor uses it to report failures while registering handlers. */
      private static final Logger LOG = Logger.getLogger(Prettify.class.getName());
      // Keyword lists for various languages.
      // Each list is a single comma-separated string.  Lists are composed by plain
      // concatenation, so a word may appear more than once in a derived list
      // (e.g. "import" is in both COMMON_KEYWORDS and the Java additions).
      /** Control-flow words reused as the base of the C, Rust, Python, Ruby and shell lists. */
      public static final String FLOW_CONTROL_KEYWORDS = "break,continue,do,else,for,if,return,while";
      /** C keywords: the flow-control words plus C type, storage and declaration words. */
      public static final String C_KEYWORDS = FLOW_CONTROL_KEYWORDS + "," + "auto,case,char,const,default,"
              + "double,enum,extern,float,goto,inline,int,long,register,short,signed,"
              + "sizeof,static,struct,switch,typedef,union,unsigned,void,volatile";
      /** C keywords plus words used as the common base of the C++, Java and JavaScript lists. */
      public static final String COMMON_KEYWORDS = C_KEYWORDS + "," + "catch,class,delete,false,import,"
              + "new,operator,private,protected,public,this,throw,true,try,typeof";
      /** Common keywords plus C++-specific words. */
      public static final String CPP_KEYWORDS = COMMON_KEYWORDS + "," + "alignof,align_union,asm,axiom,bool,"
              + "concept,concept_map,const_cast,constexpr,decltype,delegate,"
              + "dynamic_cast,explicit,export,friend,generic,late_check,"
              + "mutable,namespace,nullptr,property,reinterpret_cast,static_assert,"
              + "static_cast,template,typeid,typename,using,virtual,where";
      /** Common keywords plus Java-specific words. */
      public static final String JAVA_KEYWORDS = COMMON_KEYWORDS + ","
              + "abstract,assert,boolean,byte,extends,final,finally,implements,import,"
              + "instanceof,interface,null,native,package,strictfp,super,synchronized,"
              + "throws,transient";
      /** Flow-control words plus Rust-specific words. */
      public static final String RUST_KEYWORDS = FLOW_CONTROL_KEYWORDS + "," + "as,assert,const,copy,drop,"
              + "enum,extern,fail,false,fn,impl,let,log,loop,match,mod,move,mut,priv,"
              + "pub,pure,ref,self,static,struct,true,trait,type,unsafe,use";
      /** Java keywords plus C#-specific words (so this list also contains every Java keyword). */
      public static final String CSHARP_KEYWORDS = JAVA_KEYWORDS + ","
              + "as,base,by,checked,decimal,delegate,descending,dynamic,event,"
              + "fixed,foreach,from,group,implicit,in,internal,into,is,let,"
              + "lock,object,out,override,orderby,params,partial,readonly,ref,sbyte,"
              + "sealed,stackalloc,string,select,uint,ulong,unchecked,unsafe,ushort,"
              + "var,virtual,where";
      /** CoffeeScript keywords; a standalone list that is not derived from any other list. */
      public static final String COFFEE_KEYWORDS = "all,and,by,catch,class,else,extends,false,finally,"
              + "for,if,in,is,isnt,loop,new,no,not,null,of,off,on,or,return,super,then,"
              + "throw,true,try,unless,until,when,while,yes";
      /** Common keywords plus JavaScript-specific words. */
      public static final String JSCRIPT_KEYWORDS = COMMON_KEYWORDS + ","
              + "debugger,eval,export,function,get,null,set,undefined,var,with,"
              + "Infinity,NaN";
      /** Perl keywords; a standalone list that is not derived from any other list. */
      public static final String PERL_KEYWORDS = "caller,delete,die,do,dump,elsif,eval,exit,foreach,for,"
              + "goto,if,import,last,local,my,next,no,our,print,package,redo,require,"
              + "sub,undef,unless,until,use,wantarray,while,BEGIN,END";
      /** Flow-control words plus Python-specific words. */
      public static final String PYTHON_KEYWORDS = FLOW_CONTROL_KEYWORDS + "," + "and,as,assert,class,def,del,"
              + "elif,except,exec,finally,from,global,import,in,is,lambda,"
              + "nonlocal,not,or,pass,print,raise,try,with,yield,"
              + "False,True,None";
      /** Flow-control words plus Ruby-specific words. */
      public static final String RUBY_KEYWORDS = FLOW_CONTROL_KEYWORDS + "," + "alias,and,begin,case,class,"
              + "def,defined,elsif,end,ensure,false,in,module,next,nil,not,or,redo,"
              + "rescue,retry,self,super,then,true,undef,unless,until,when,yield,"
              + "BEGIN,END";
      /** Flow-control words plus shell-specific words. */
      public static final String SH_KEYWORDS = FLOW_CONTROL_KEYWORDS + "," + "case,done,elif,esac,eval,fi,"
              + "function,in,local,set,then,until";
      /**
       * Concatenation of the C++, C#, JavaScript, Perl, Python, Ruby and shell
       * lists.  COFFEE_KEYWORDS and RUST_KEYWORDS are not part of it.
       */
      public static final String ALL_KEYWORDS = CPP_KEYWORDS + "," + CSHARP_KEYWORDS + "," + JSCRIPT_KEYWORDS + "," + PERL_KEYWORDS + ","
              + PYTHON_KEYWORDS + "," + RUBY_KEYWORDS + "," + SH_KEYWORDS;
      /**
       * Start-anchored pattern for common C/C++ library type names (DIR, FILE,
       * vector, queue variants, list, stack, iterators, set/map variants, bitset
       * and sized int/float names), each required to end at a word boundary.
       */
      public static final Pattern C_TYPES = Pattern.compile("^(DIR|FILE|vector|(de|priority_)?queue|list|stack|(const_)?iterator|(multi)?(set|map)|bitset|u?(int|float)\\d*)\\b");
      // Token style names.  These correspond to CSS classes.
      /**
       * Token style for a string literal.
       */
      public static final String PR_STRING = "str";
      /**
       * Token style for a keyword.
       */
      public static final String PR_KEYWORD = "kwd";
      /**
       * Token style for a comment.
       */
      public static final String PR_COMMENT = "com";
      /**
       * Token style for a type.
       */
      public static final String PR_TYPE = "typ";
      /**
       * Token style for a literal value, e.g. 1, null, true.
       */
      public static final String PR_LITERAL = "lit";
      /**
       * Token style for a punctuation string.
       */
      public static final String PR_PUNCTUATION = "pun";
      /**
       * Token style for plain text.
       */
      public static final String PR_PLAIN = "pln";
      /**
       * Token style for an sgml tag.
       */
      public static final String PR_TAG = "tag";
      /**
       * Token style for a markup declaration such as a DOCTYPE.
       */
      public static final String PR_DECLARATION = "dec";
      /**
       * Token style for embedded source.
       */
      public static final String PR_SOURCE = "src";
      /**
       * Token style for an sgml attribute name.
       */
      public static final String PR_ATTRIB_NAME = "atn";
      /**
       * Token style for an sgml attribute value.
       */
      public static final String PR_ATTRIB_VALUE = "atv";
      /**
       * A class that indicates a section of markup that is not code, e.g. to allow
       * embedding of line numbers within code listings.
       */
      public static final String PR_NOCODE = "nocode";
      /**
       * A set of tokens that can precede a regular expression literal in
       * javascript
       * http://web.archive.org/web/20070717142515/http://www.mozilla.org/js/language/js20/rationale/syntax.html
       * has the full list, but I've removed ones that might be problematic when
       * seen in languages that don't support regular expression literals.
       *
       * <p>Specifically, I've removed any keywords that can't precede a regexp
       * literal in a syntactically legal javascript program, and I've removed the
       * "in" keyword since it's not a keyword in many languages, and might be used
       * as a count of inches.
       *
       * <p>The link above does not accurately describe EcmaScript rules since
       * it fails to distinguish between (a=++/b/i) and (a++/b/i) but it works
       * very well in practice.
       *
       * <p>The value is regular-expression source text (not a compiled
       * {@link Pattern}): one non-capturing alternation of the preceder tokens,
       * followed by optional whitespace.
       */
      private static final String REGEXP_PRECEDER_PATTERN = "(?:^^\\.?|[+-]|[!=]=?=?|\\#|%=?|&&?=?|\\(|\\*=?|[+\\-]=|->|\\/=?|::?|<<?=?|>>?>?=?|,|;|\\?|@|\\[|~|\\{|\\^\\^?=?|\\|\\|?=?|break|case|continue|delete|do|else|finally|instanceof|return|throw|try|typeof)\\s*";
      // CAVEAT: this does not properly handle the case where a regular
      // expression immediately follows another since a regular expression may
      // have flags for case-sensitivity and the like.  Having regexp tokens
      // adjacent is not valid in any language I'm aware of, so I'm punting.
      // TODO: maybe style special characters inside a regexp as punctuation.
    
      /**
       * Creates a highlighter and registers the built-in language handlers:
       * the default code handler, the markup handlers ("default-markup",
       * "in.tag", "uq.val"), the keyword-driven source decorators (C family,
       * JSON, C#, Java, shell, Python, Perl, Ruby, JavaScript, CoffeeScript,
       * Rust, Protocol Buffers), the "regex" handler, and finally the
       * {@code Lang*} extension classes.
       *
       * <p>Any exception raised while building or registering a handler is
       * logged at {@link Level#SEVERE} and not rethrown; handlers that would
       * have been registered after the failing statement are skipped.
       */
      public Prettify() {
        try {
          // Fallback handler for source code in an unspecified language.
          Map<String, Object> decorateSourceMap = new HashMap<String, Object>();
          decorateSourceMap.put("keywords", ALL_KEYWORDS);
          decorateSourceMap.put("hashComments", true);
          decorateSourceMap.put("cStyleComments", true);
          decorateSourceMap.put("multiLineStrings", true);
          decorateSourceMap.put("regexLiterals", true);
          registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList(new String[]{"default-code"}));

          List<List<Object>> shortcutStylePatterns, fallthroughStylePatterns;

          // Markup handler: splits a document into text, declarations, comments,
          // embedded-language sections and tags (tags are delegated to "in.tag").
          shortcutStylePatterns = new ArrayList<List<Object>>();
          fallthroughStylePatterns = new ArrayList<List<Object>>();
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_PLAIN, Pattern.compile("^[^<?]+")}));
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_DECLARATION, Pattern.compile("^<!\\w[^>]*(?:>|$)")}));
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT, Pattern.compile("^<\\!--[\\s\\S]*?(?:-\\->|$)")}));
          // Unescaped content in an unknown language
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-", Pattern.compile("^<\\?([\\s\\S]+?)(?:\\?>|$)")}));
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-", Pattern.compile("^<%([\\s\\S]+?)(?:%>|$)")}));
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_PUNCTUATION, Pattern.compile("^(?:<[%?]|[%?]>)")}));
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-", Pattern.compile("^<xmp\\b[^>]*>([\\s\\S]+?)<\\/xmp\\b[^>]*>", Pattern.CASE_INSENSITIVE)}));
          // Unescaped content in javascript.  (Or possibly vbscript).
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-js", Pattern.compile("^<script\\b[^>]*>([\\s\\S]*?)(<\\/script\\b[^>]*>)", Pattern.CASE_INSENSITIVE)}));
          // Contains unescaped stylesheet content
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-css", Pattern.compile("^<style\\b[^>]*>([\\s\\S]*?)(<\\/style\\b[^>]*>)", Pattern.CASE_INSENSITIVE)}));
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-in.tag", Pattern.compile("^(<\\/?[a-z][^<>]*>)", Pattern.CASE_INSENSITIVE)}));
          registerLangHandler(new CreateSimpleLexer(shortcutStylePatterns, fallthroughStylePatterns), Arrays.asList(new String[]{"default-markup", "htm", "html", "mxml", "xhtml", "xml", "xsl"}));

          // Handler for the inside of a single tag: names, attributes, and
          // attribute values that embed JavaScript (on*) or CSS (style).
          shortcutStylePatterns = new ArrayList<List<Object>>();
          fallthroughStylePatterns = new ArrayList<List<Object>>();
          shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_PLAIN, Pattern.compile("^[\\s]+"), null, " \t\r\n"}));
          shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_ATTRIB_VALUE, Pattern.compile("^(?:\\\"[^\\\"]*\\\"?|\\'[^\\']*\\'?)"), null, "\"'"}));
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_TAG, Pattern.compile("^^<\\/?[a-z](?:[\\w.:-]*\\w)?|\\/?>$", Pattern.CASE_INSENSITIVE)}));
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_ATTRIB_NAME, Pattern.compile("^(?!style[\\s=]|on)[a-z](?:[\\w:-]*\\w)?", Pattern.CASE_INSENSITIVE)}));
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-uq.val", Pattern.compile("^=\\s*([^>\\'\\\"\\s]*(?:[^>\\'\\\"\\s\\/]|\\/(?=\\s)))", Pattern.CASE_INSENSITIVE)}));
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_PUNCTUATION, Pattern.compile("^[=<>\\/]+")}));
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-js", Pattern.compile("^on\\w+\\s*=\\s*\\\"([^\\\"]+)\\\"", Pattern.CASE_INSENSITIVE)}));
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-js", Pattern.compile("^on\\w+\\s*=\\s*\\'([^\\']+)\\'", Pattern.CASE_INSENSITIVE)}));
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-js", Pattern.compile("^on\\w+\\s*=\\s*([^\\\"\\'>\\s]+)", Pattern.CASE_INSENSITIVE)}));
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-css", Pattern.compile("^style\\s*=\\s*\\\"([^\\\"]+)\\\"", Pattern.CASE_INSENSITIVE)}));
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-css", Pattern.compile("^style\\s*=\\s*\\'([^\\']+)\\'", Pattern.CASE_INSENSITIVE)}));
          // Unquoted style value.  The whitespace after '=' is optional ("\\s*"),
          // exactly as in the unquoted on* rule above; the previous "\\s\\*"
          // demanded one whitespace character followed by a literal '*'.
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-css", Pattern.compile("^style\\s*=\\s*([^\\\"\\'>\\s]+)", Pattern.CASE_INSENSITIVE)}));
          registerLangHandler(new CreateSimpleLexer(shortcutStylePatterns, fallthroughStylePatterns), Arrays.asList(new String[]{"in.tag"}));

          // Handler for an unquoted attribute value: everything is the value.
          shortcutStylePatterns = new ArrayList<List<Object>>();
          fallthroughStylePatterns = new ArrayList<List<Object>>();
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_ATTRIB_VALUE, Pattern.compile("^[\\s\\S]+")}));
          registerLangHandler(new CreateSimpleLexer(shortcutStylePatterns, fallthroughStylePatterns), Arrays.asList(new String[]{"uq.val"}));

          // C, C++ and related file extensions.
          decorateSourceMap = new HashMap<String, Object>();
          decorateSourceMap.put("keywords", CPP_KEYWORDS);
          decorateSourceMap.put("hashComments", true);
          decorateSourceMap.put("cStyleComments", true);
          decorateSourceMap.put("types", C_TYPES);
          registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList(new String[]{"c", "cc", "cpp", "cxx", "cyc", "m"}));

          // JSON.
          decorateSourceMap = new HashMap<String, Object>();
          decorateSourceMap.put("keywords", "null,true,false");
          registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList(new String[]{"json"}));

          // C#.
          decorateSourceMap = new HashMap<String, Object>();
          decorateSourceMap.put("keywords", CSHARP_KEYWORDS);
          decorateSourceMap.put("hashComments", true);
          decorateSourceMap.put("cStyleComments", true);
          decorateSourceMap.put("verbatimStrings", true);
          decorateSourceMap.put("types", C_TYPES);
          registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList(new String[]{"cs"}));

          // Java.
          decorateSourceMap = new HashMap<String, Object>();
          decorateSourceMap.put("keywords", JAVA_KEYWORDS);
          decorateSourceMap.put("cStyleComments", true);
          registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList(new String[]{"java"}));

          // Shell dialects.
          decorateSourceMap = new HashMap<String, Object>();
          decorateSourceMap.put("keywords", SH_KEYWORDS);
          decorateSourceMap.put("hashComments", true);
          decorateSourceMap.put("multiLineStrings", true);
          registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList(new String[]{"bash", "bsh", "csh", "sh", "ebuild", "eclass"}));

          // Python.
          decorateSourceMap = new HashMap<String, Object>();
          decorateSourceMap.put("keywords", PYTHON_KEYWORDS);
          decorateSourceMap.put("hashComments", true);
          decorateSourceMap.put("multiLineStrings", true);
          decorateSourceMap.put("tripleQuotedStrings", true);
          registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList(new String[]{"cv", "py", "python"}));

          // Perl.
          decorateSourceMap = new HashMap<String, Object>();
          decorateSourceMap.put("keywords", PERL_KEYWORDS);
          decorateSourceMap.put("hashComments", true);
          decorateSourceMap.put("multiLineStrings", true);
          decorateSourceMap.put("regexLiterals", 2);   // multiline regex literals
          registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList(new String[]{"perl", "pl", "pm"}));

          // Ruby.
          decorateSourceMap = new HashMap<String, Object>();
          decorateSourceMap.put("keywords", RUBY_KEYWORDS);
          decorateSourceMap.put("hashComments", true);
          decorateSourceMap.put("multiLineStrings", true);
          decorateSourceMap.put("regexLiterals", true);
          registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList(new String[]{"rb", "ruby"}));

          // JavaScript.
          decorateSourceMap = new HashMap<String, Object>();
          decorateSourceMap.put("keywords", JSCRIPT_KEYWORDS);
          decorateSourceMap.put("cStyleComments", true);
          decorateSourceMap.put("regexLiterals", true);
          registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList(new String[]{"javascript", "js"}));

          // CoffeeScript.
          decorateSourceMap = new HashMap<String, Object>();
          decorateSourceMap.put("keywords", COFFEE_KEYWORDS);
          decorateSourceMap.put("hashComments", 3); // ### style block comments
          decorateSourceMap.put("cStyleComments", true);
          // NOTE(review): this key is spelled "multilineStrings" (lower-case 'l'),
          // while sourceDecorator looks up "multiLineStrings"; as written the
          // option does not appear to be picked up there.  Left unchanged because
          // enabling it would alter string tokenization -- confirm the intent.
          decorateSourceMap.put("multilineStrings", true);
          decorateSourceMap.put("tripleQuotedStrings", true);
          decorateSourceMap.put("regexLiterals", true);
          registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList(new String[]{"coffee"}));

          // Rust.
          decorateSourceMap = new HashMap<String, Object>();
          decorateSourceMap.put("keywords", RUST_KEYWORDS);
          decorateSourceMap.put("cStyleComments", true);
          // NOTE(review): same key-casing mismatch as the CoffeeScript entry above
          // ("multilineStrings" vs. the "multiLineStrings" read by sourceDecorator).
          decorateSourceMap.put("multilineStrings", true);
          registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList(new String[]{"rc", "rs", "rust"}));

          // Regular-expression text: the whole input is styled as a string.
          shortcutStylePatterns = new ArrayList<List<Object>>();
          fallthroughStylePatterns = new ArrayList<List<Object>>();
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_STRING, Pattern.compile("^[\\s\\S]+")}));
          registerLangHandler(new CreateSimpleLexer(shortcutStylePatterns, fallthroughStylePatterns), Arrays.asList(new String[]{"regex"}));

          /**
           * Registers a language handler for Protocol Buffers as described at
           * http://code.google.com/p/protobuf/.
           *
           * Based on the lexical grammar at
           * http://research.microsoft.com/fsharp/manual/spec2.aspx#_Toc202383715
           *
           * @author mikesamuel@gmail.com
           */
          decorateSourceMap = new HashMap<String, Object>();
          decorateSourceMap.put("keywords", "bytes,default,double,enum,extend,extensions,false,"
                  + "group,import,max,message,option,"
                  + "optional,package,repeated,required,returns,rpc,service,"
                  + "syntax,to,true");
          decorateSourceMap.put("types", Pattern.compile("^(bool|(double|s?fixed|[su]?int)(32|64)|float|string)\\b"));
          decorateSourceMap.put("cStyleComments", true);
          registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList(new String[]{"proto"}));

          // Language extensions that live in their own Lang* classes.
          register(LangAppollo.class);
          register(LangBasic.class);
          register(LangClj.class);
          register(LangCss.class);
          register(LangDart.class);
          register(LangErlang.class);
          register(LangGo.class);
          register(LangHs.class);
          register(LangLisp.class);
          register(LangLlvm.class);
          register(LangLua.class);
          register(LangMatlab.class);
          register(LangMl.class);
          register(LangMumps.class);
          register(LangN.class);
          register(LangPascal.class);
          register(LangR.class);
          register(LangRd.class);
          register(LangScala.class);
          register(LangSql.class);
          register(LangTex.class);
          register(LangVb.class);
          register(LangVhdl.class);
          register(LangTcl.class);
          register(LangWiki.class);
          register(LangXq.class);
          register(LangYaml.class);
        } catch (Exception ex) {
          // Best effort: report the failure but leave the instance usable with
          // whatever handlers were registered before the exception.
          LOG.log(Level.SEVERE, null, ex);
        }
      }
    
      /**
       * Runs {@code langHandler} over {@code sourceCode} as a fresh job and
       * appends every decoration that job produced to {@code out}.
       *
       * @param basePos the index of sourceCode within the chunk of source
       *    whose decorations are already present on out.
       * @param sourceCode the text to decorate; must not be null
       * @param langHandler the lexer used to decorate the text
       * @param out the decoration list that receives the new entries
       * @throws NullPointerException if sourceCode is null
       */
      protected static void appendDecorations(int basePos, String sourceCode, CreateSimpleLexer langHandler, List<Object> out) {
        if (null == sourceCode) {
          throw new NullPointerException("argument 'sourceCode' cannot be null");
        }

        // Wrap the text in its own job so the handler works on it in isolation.
        final Job subJob = new Job();
        subJob.setSourceCode(sourceCode);
        subJob.setBasePos(basePos);

        langHandler.decorate(subJob);

        out.addAll(subJob.getDecorations());
      }
    
      /**
       * A lexer assembled from an ordered list of style patterns.  It splits
       * source text into tokens with one combined tokenizer pattern and then
       * assigns each token a style, delegating embedded-language tokens
       * ("lang-*" styles) to the handler registered for that language.
       */
      public class CreateSimpleLexer {

        /** Patterns tried in order when no shortcut applies to a token. */
        protected List<List<Object>> fallthroughStylePatterns;
        /** Maps a token's first character to the single pattern entry declared for it. */
        protected Map<Character, List<Object>> shortcuts = new HashMap<Character, List<Object>>();
        /** Combined pattern used to split the source into tokens. */
        protected Pattern tokenizer;
        /** Number of fall-through patterns, captured at construction time. */
        protected int nPatterns;

        /** Given triples of [style, pattern, context] returns a lexing function,
         * The lexing function interprets the patterns to find token boundaries and
         * returns a decoration list of the form
         * [index_0, style_0, index_1, style_1, ..., index_n, style_n]
         * where index_n is an index into the sourceCode, and style_n is a style
         * constant like PR_PLAIN.  index_n-1 &lt;= index_n, and style_n-1 applies to
         * all characters in sourceCode[index_n-1:index_n].
         *
         * The stylePatterns is a list whose elements have the form
         * [style : string, pattern : RegExp, DEPRECATED, shortcut : string].
         *
         * Style is a style constant like PR_PLAIN, or can be a string of the
         * form 'lang-FOO', where FOO is a language extension describing the
         * language of the portion of the token in $1 after pattern executes.
         * E.g., if style is 'lang-lisp', and group 1 contains the text
         * '(hello (world))', then that portion of the token will be passed to the
         * registered lisp handler for formatting.
         * The text before and after group 1 will be restyled using this decorator
         * so decorators should take care that this doesn't result in infinite
         * recursion.  For example, the HTML lexer rule for SCRIPT elements looks
         * something like ['lang-js', /&lt;[s]cript&gt;(.+?)&lt;\/script&gt;/].  This may match
         * '&lt;script&gt;foo()&lt;\/script&gt;', which would cause the current decorator to
         * be called with '&lt;script&gt;' which would not match the same rule since
         * group 1 must not be empty, so it would be instead styled as PR_TAG by
         * the generic tag rule.  The handler registered for the 'js' extension would
         * then be called with 'foo()', and finally, the current decorator would
         * be called with '&lt;\/script&gt;' which would not match the original rule and
         * so the generic tag rule would identify it as a tag.
         *
         * Pattern must only match prefixes, and if it matches a prefix, then that
         * match is considered a token with the same style.
         *
         * Context is applied to the last non-whitespace, non-comment token
         * recognized.
         *
         * Shortcut is an optional string of characters, any of which, if the first
         * character, guarantee that this pattern and only this pattern matches.
         *
         * @param shortcutStylePatterns patterns that always start with
         *   a known character.  Must have a shortcut string.
         * @param fallthroughStylePatterns patterns that will be tried in
         *   order if the shortcut ones fail.  May have shortcuts.
         * @throws Exception declared by the original signature; presumably raised
         *   while the patterns are combined into the tokenizer (confirm against
         *   CombinePrefixPattern)
         */
        protected CreateSimpleLexer(List<List<Object>> shortcutStylePatterns, List<List<Object>> fallthroughStylePatterns) throws Exception {
          this.fallthroughStylePatterns = fallthroughStylePatterns;

          List<List<Object>> allPatterns = new ArrayList<List<Object>>(shortcutStylePatterns);
          allPatterns.addAll(fallthroughStylePatterns);
          List<Pattern> allRegexs = new ArrayList<Pattern>();
          // Pattern sources already handed to the tokenizer, so each is added once.
          // NOTE(review): the key is the pattern source only, so two patterns that
          // differ just in their flags are treated as duplicates -- confirm that
          // this is intended.
          Set<String> seenRegexSources = new HashSet<String>();
          for (List<Object> patternParts : allPatterns) {
            String shortcutChars = patternParts.size() > 3 ? (String) patternParts.get(3) : null;
            if (shortcutChars != null) {
              // A later entry overrides an earlier one that claims the same character.
              for (int c = shortcutChars.length(); --c >= 0;) {
                shortcuts.put(shortcutChars.charAt(c), patternParts);
              }
            }
            Pattern regex = (Pattern) patternParts.get(1);
            if (seenRegexSources.add(regex.pattern())) {
              allRegexs.add(regex);
            }
          }
          // Catch-all single character so the tokenizer always makes progress.
          allRegexs.add(Pattern.compile("[\0-\\uffff]"));
          tokenizer = new CombinePrefixPattern().combinePrefixPattern(allRegexs);

          nPatterns = fallthroughStylePatterns.size();
        }

        /**
         * Lexes job.sourceCode and produces an output array job.decorations of
         * style classes preceded by the position at which they start in
         * job.sourceCode in order.
         *
         * @param job an object like <pre>{
         *    sourceCode: {string} sourceText plain text,
         *    basePos: {int} position of job.sourceCode in the larger chunk of
         *        sourceCode.
         * }</pre>
         */
        public void decorate(Job job) {
          String sourceCode = job.getSourceCode();
          int basePos = job.getBasePos();
          // Even entries are positions in source in ascending order.  Odd entries
          // are style markers (e.g., PR_COMMENT) that run from that position until
          // the next entry.
          List<Object> decorations = new ArrayList<Object>(Arrays.asList(new Object[]{basePos, PR_PLAIN}));
          int pos = 0;  // index into sourceCode
          String[] tokens = Util.match(tokenizer, sourceCode, true);
          // Remembers the style chosen for a token text so repeated tokens skip
          // pattern matching.  Embedded tokens are never cached.
          Map<String, String> styleCache = new HashMap<String, String>();

          for (String token : tokens) {
            String style = styleCache.get(token);
            String[] match = null;

            boolean isEmbedded;
            if (style != null) {
              isEmbedded = false;
            } else {
              // An empty token has no first character to look up; treat it as
              // having no shortcut instead of failing in charAt(0).
              List<Object> patternParts = token.length() > 0 ? shortcuts.get(token.charAt(0)) : null;
              if (patternParts != null) {
                match = Util.match((Pattern) patternParts.get(1), token, false);
                style = (String) patternParts.get(0);
              } else {
                for (int i = 0; i < nPatterns; ++i) {
                  patternParts = fallthroughStylePatterns.get(i);
                  match = Util.match((Pattern) patternParts.get(1), token, false);
                  if (match != null && match.length != 0) {
                    style = (String) patternParts.get(0);
                    break;
                  }
                }

                // Make sure that we make progress.  'match' is still null when
                // there are no fall-through patterns at all, so check for that
                // before reading its length.
                if (match == null || match.length == 0) {
                  style = PR_PLAIN;
                }
              }

              isEmbedded = style != null && style.startsWith("lang-");
              if (isEmbedded && !(match != null && match.length > 1 && match[1] != null)) {
                // A "lang-" rule without a captured group 1 has nothing to hand
                // to another handler; style the whole token as embedded source.
                isEmbedded = false;
                style = PR_SOURCE;
              }

              if (!isEmbedded) {
                styleCache.put(token, style);
              }
            }

            int tokenStart = pos;
            pos += token.length();

            if (!isEmbedded) {
              decorations.add(basePos + tokenStart);
              decorations.add(style);
            } else {  // Treat group 1 as an embedded block of source code.
              String embeddedSource = match[1];
              int embeddedSourceStart = token.indexOf(embeddedSource);
              int embeddedSourceEnd = embeddedSourceStart + embeddedSource.length();
              if (match.length > 2 && match[2] != null) {
                // If embeddedSource can be blank, then it would match at the
                // beginning which would cause us to infinitely recurse on the
                // entire token, so we catch the right context in match[2].
                embeddedSourceEnd = token.length() - match[2].length();
                embeddedSourceStart = embeddedSourceEnd - embeddedSource.length();
              }
              String lang = style.substring(5);  // text after the "lang-" prefix
              // Decorate the left of the embedded source
              appendDecorations(basePos + tokenStart,
                      token.substring(0, embeddedSourceStart),
                      this, decorations);
              // Decorate the embedded source
              appendDecorations(basePos + tokenStart + embeddedSourceStart,
                      embeddedSource,
                      langHandlerForExtension(lang, embeddedSource),
                      decorations);
              // Decorate the right of the embedded section
              appendDecorations(basePos + tokenStart + embeddedSourceEnd,
                      token.substring(embeddedSourceEnd),
                      this, decorations);
            }
          }

          job.setDecorations(Util.removeDuplicates(decorations, job.getSourceCode()));
        }
      }
    
      /**
       * Returns a lexer that produces a list of decorations from source text.
       *
       * This code treats ", ', and ` as string delimiters, and \ as a string
       * escape.  It does not recognize perl's qq() style strings.
       * It has no special handling for double delimiter escapes as in basic, or
       * the tripled delimiters used in python, but should work on those regardless
       * although in those cases a single string literal may be broken up into
       * multiple adjacent string literals.
       *
       * It recognizes C, C++, and shell style comments when the matching
       * options are enabled.
       *
       * <p>Option keys read from {@code options}:
       * {@code tripleQuotedStrings}, {@code multiLineStrings},
       * {@code verbatimStrings}, {@code hashComments} (an {@code Integer}
       * greater than 1 selects multi-line hash comments when
       * {@code cStyleComments} is also set), {@code cStyleComments},
       * {@code regexLiterals} (a value greater than 1 allows multi-line regex
       * literals), {@code types} (a {@link Pattern}) and {@code keywords}
       * (a {@code String} of keywords separated by whitespace and/or commas).
       *
       * <p>Each pattern entry is a list of: the style, the {@link Pattern},
       * a {@code null} slot, and optionally a string of characters (presumably
       * the characters a token may start with for the pattern to be picked
       * directly -- confirm against {@code CreateSimpleLexer}).  The order in
       * which entries are added is significant and must not be changed.
       *
       * @param options a set of optional parameters; must not be null.
       * @return a lexer that examines the source code
       *     in the input job and builds the decoration list.
       * @throws Exception if one of the helpers called here, or the
       *     {@code CreateSimpleLexer} constructor, fails.
       */
      protected CreateSimpleLexer sourceDecorator(Map<String, Object> options) throws Exception {
        List<List<Object>> shortcutStylePatterns = new ArrayList<List<Object>>();
        List<List<Object>> fallthroughStylePatterns = new ArrayList<List<Object>>();

        // String literals: exactly one of the three flavours below is installed.
        if (Util.getVariableValueAsBoolean(options.get("tripleQuotedStrings"))) {
          // '''multi-line-string''', 'single-line-string', and double-quoted
          shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_STRING,
                    Pattern.compile("^(?:\\'\\'\\'(?:[^\\'\\\\]|\\\\[\\s\\S]|\\'{1,2}(?=[^\\']))*(?:\\'\\'\\'|$)|\\\"\\\"\\\"(?:[^\\\"\\\\]|\\\\[\\s\\S]|\\\"{1,2}(?=[^\\\"]))*(?:\\\"\\\"\\\"|$)|\\'(?:[^\\\\\\']|\\\\[\\s\\S])*(?:\\'|$)|\\\"(?:[^\\\\\\\"]|\\\\[\\s\\S])*(?:\\\"|$))"),
                    null,
                    "'\""}));
        } else if (Util.getVariableValueAsBoolean(options.get("multiLineStrings"))) {
          // 'multi-line-string', "multi-line-string"
          shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_STRING,
                    Pattern.compile("^(?:\\'(?:[^\\\\\\']|\\\\[\\s\\S])*(?:\\'|$)|\\\"(?:[^\\\\\\\"]|\\\\[\\s\\S])*(?:\\\"|$)|\\`(?:[^\\\\\\`]|\\\\[\\s\\S])*(?:\\`|$))"),
                    null,
                    "'\"`"}));
        } else {
          // 'single-line-string', "single-line-string"
          shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_STRING,
                    Pattern.compile("^(?:\\'(?:[^\\\\\\'\r\n]|\\\\.)*(?:\\'|$)|\\\"(?:[^\\\\\\\"\r\n]|\\\\.)*(?:\\\"|$))"),
                    null,
                    "\"'"}));
        }
        if (Util.getVariableValueAsBoolean(options.get("verbatimStrings"))) {
          // verbatim-string-literal production from the C# grammar.  See issue 93.
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_STRING,
                    Pattern.compile("^@\\\"(?:[^\\\"]|\\\"\\\")*(?:\\\"|$)"),
                    null}));
        }

        // Hash comments, optionally combined with C preprocessor handling.
        Object hc = options.get("hashComments");
        if (Util.getVariableValueAsBoolean(hc)) {
          if (Util.getVariableValueAsBoolean(options.get("cStyleComments"))) {
            if ((hc instanceof Integer) && (Integer) hc > 1) {  // multiline hash comments
              shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT,
                        Pattern.compile("^#(?:##(?:[^#]|#(?!##))*(?:###|$)|.*)"),
                        null,
                        "#"}));
            } else {
              // Stop C preprocessor declarations at an unclosed open comment
              shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT,
                        Pattern.compile("^#(?:(?:define|e(?:l|nd)if|else|error|ifn?def|include|line|pragma|undef|warning)\\b|[^\r\n]*)"),
                        null,
                        "#"}));
            }
            // #include <stdio.h>
            fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_STRING,
                      Pattern.compile("^<(?:(?:(?:\\.\\.\\/)*|\\/?)(?:[\\w-]+(?:\\/[\\w-]+)+)?[\\w-]+\\.h(?:h|pp|\\+\\+)?|[a-z]\\w*)>"),
                      null}));
          } else {
            shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT,
                      Pattern.compile("^#[^\r\n]*"),
                      null,
                      "#"}));
          }
        }
        if (Util.getVariableValueAsBoolean(options.get("cStyleComments"))) {
          // Line comment: // up to the end of the line.
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT,
                    Pattern.compile("^\\/\\/[^\r\n]*"),
                    null}));

          // Block comment: /* ... */, or up to the end of input if unclosed.
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT,
                    Pattern.compile("^\\/\\*[\\s\\S]*?(?:\\*\\/|$)"),
                    null}));
        }

        Object regexLiterals = options.get("regexLiterals");
        if (Util.getVariableValueAsBoolean(regexLiterals)) {
          // Javascript treat true as 1
          // Characters a regex literal may not contain: none when multi-line
          // literals are allowed, line breaks otherwise.
          String regexExcls = Util.getVariableValueAsInteger(regexLiterals) > 1
                  ? "" // Multiline regex literals
                  : "\n\r";
          // What an escape sequence may be followed by: "." stops at line
          // breaks, "[\S\s]" does not.
          String regexAny = !regexExcls.isEmpty() ? "." : "[\\S\\s]";
          String REGEX_LITERAL =
                  // A regular expression literal starts with a slash that is
                  // not followed by * or / so that it is not confused with
                  // comments.
                  "/(?=[^/*" + regexExcls + "])"
                  // and then contains any number of raw characters,
                  + "(?:[^/\\x5B\\x5C" + regexExcls + "]"
                  // escape sequences (\x5C),
                  + "|\\x5C" + regexAny
                  // or non-nesting character sets (\x5B\x5D);
                  + "|\\x5B(?:[^\\x5C\\x5D" + regexExcls + "]"
                  + "|\\x5C" + regexAny + ")*(?:\\x5D|$))+"
                  // finally closed by a /.
                  + "/";
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-regex",
                    Pattern.compile("^" + REGEXP_PRECEDER_PATTERN + "(" + REGEX_LITERAL + ")")}));
        }

        Pattern types = (Pattern) options.get("types");
        if (Util.getVariableValueAsBoolean(types)) {
          fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_TYPE, types}));
        }

        String keywords = (String) options.get("keywords");
        if (keywords != null) {
          // Drop a single leading/trailing space, then turn the separators
          // (whitespace and/or commas) into regex alternation.
          keywords = keywords.replaceAll("^ | $", "");
          if (keywords.length() != 0) {
            fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_KEYWORD,
                      Pattern.compile("^(?:" + keywords.replaceAll("[\\s,]+", "|") + ")\\b"),
                      null}));
          }
        }

        // Whitespace, including the non-breaking space (0xA0).
        shortcutStylePatterns.add(Arrays.asList(new Object[]{PR_PLAIN,
                  Pattern.compile("^\\s+"),
                  null,
                  " \r\n\t" + Character.toString((char) 0xA0)
                }));

        // TODO(mikesamuel): recognize non-latin letters and numerals in idents
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_LITERAL,
                  Pattern.compile("^@[a-z_$][a-z_$@0-9]*", Pattern.CASE_INSENSITIVE),
                  null}));
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_TYPE,
                  Pattern.compile("^(?:[@_]?[A-Z]+[a-z][A-Za-z_$@0-9]*|\\w+_t\\b)"),
                  null}));
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_PLAIN,
                  Pattern.compile("^[a-z_$][a-z_$@0-9]*", Pattern.CASE_INSENSITIVE),
                  null}));
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_LITERAL,
                  Pattern.compile("^(?:"
                  // A hex number
                  + "0x[a-f0-9]+"
                  // or an octal or decimal number; the second alternative is a
                  // fraction without a leading digit such as ".5" (one or more
                  // digits after the dot, not a digit followed by a literal '+'),
                  + "|(?:\\d(?:_\\d+)*\\d*(?:\\.\\d*)?|\\.\\d+)"
                  // possibly in scientific notation
                  + "(?:e[+\\-]?\\d+)?"
                  + ')'
                  // with an optional modifier like UL for unsigned long
                  + "[a-z]*", Pattern.CASE_INSENSITIVE),
                  null,
                  "0123456789"}));
        // Don't treat escaped quotes in bash as starting strings.
        // See issue 144.
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_PLAIN,
                  Pattern.compile("^\\\\[\\s\\S]?"),
                  null}));

        // The Bash man page says

        // A word is a sequence of characters considered as a single
        // unit by GRUB. Words are separated by metacharacters,
        // which are the following plus space, tab, and newline: { }
        // | & $ ; < >
        // ...

        // A word beginning with # causes that word and all remaining
        // characters on that line to be ignored.

        // which means that only a '#' after /(?:^|[{}|&$;<>\s])/ starts a
        // comment but empirically
        // $ echo {#}
        // {#}
        // $ echo \$#
        // $#
        // $ echo }#
        // }#

        // so /(?:^|[|&;<>\s])/ is more appropriate.

        // http://gcc.gnu.org/onlinedocs/gcc-2.95.3/cpp_1.html#SEC3
        // suggests that this definition is compatible with a
        // default mode that tries to use a single token definition
        // to recognize both bash/python style comments and C
        // preprocessor directives.

        // This definition of punctuation does not include # in the list of
        // follow-on exclusions, so # will not be broken before if preceded
        // by a punctuation character.  We could try to exclude # after
        // [|&;<>] but that doesn't seem to cause many major problems.
        // If that does turn out to be a problem, we should change the below
        // when hc is truthy to include # in the run of punctuation characters
        // only when not following [|&;<>].
        String punctuation = "^.[^\\s\\w.$@'\"`/\\\\]*";
        if (Util.getVariableValueAsBoolean(regexLiterals)) {
            // Leave a slash that may open a regex literal out of the
            // punctuation run.
            punctuation += "(?!\\s*/)";
        }
        fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_PUNCTUATION,
                  Pattern.compile(punctuation),
                  null}));

        return new CreateSimpleLexer(shortcutStylePatterns, fallthroughStylePatterns);
      }
      /**
       * Maps language-specific file extensions to handlers.
       *
       * The value type is {@code Object} because two kinds of value are stored:
       * a ready {@code CreateSimpleLexer} (put by {@code registerLangHandler} and
       * by {@code langHandlerForExtension}) or the {@code Class} of a
       * {@code Lang} (put by {@code register}), which
       * {@code langHandlerForExtension} replaces with a lexer on first use.
       */
      protected Map<String, Object> langHandlerRegistry = new HashMap<String, Object>();
    
      /**
       * Registers a ready-made language handler under each given file extension.
       *
       * The extensions are processed from the last list entry to the first.
       * An extension is only accepted while the registry holds no value for it;
       * on the first extension that is already taken an exception is thrown and
       * the extensions handled before it stay registered.
       *
       * @param handler a function from source code to a list
       *      of decorations.  It takes a single argument job which describes the
       *      state of the computation and has the form
       *      {@code {
       *        sourceCode: {string} as plain text.
       *        decorations: {Array.<number|string>} an array of style classes
       *                     preceded by the position at which they start in
       *                     job.sourceCode in order.
       *                     The language handler should assign this field.
       *        basePos: {int} the position of source in the larger source chunk.
       *                 All positions in the output decorations array are relative
       *                 to the larger source chunk.
       *      } }
       * @param fileExtensions the extensions to map to {@code handler}
       * @throws Exception if a handler is already registered for one of the
       *      extensions
       */
      protected void registerLangHandler(CreateSimpleLexer handler, List<String> fileExtensions) throws Exception {
        for (int index = fileExtensions.size() - 1; index >= 0; index--) {
          String extension = fileExtensions.get(index);
          boolean alreadyTaken = langHandlerRegistry.get(extension) != null;
          if (alreadyTaken) {
            throw new Exception("cannot override language handler " + extension);
          }
          langHandlerRegistry.put(extension, handler);
        }
      }
    
      /**
       * Registers a language by its class. The class itself is stored in the
       * registry; it is not instantiated here.
       *
       * The file extensions are obtained from the class through
       * {@code getFileExtensionsFromClass} and processed from the last entry to
       * the first. Extensions handled before a conflicting one stay registered.
       *
       * @param clazz the class of the language, must not be null
       * @throws NullPointerException if {@code clazz} is null
       * @throws Exception if the file extensions cannot be read from the class,
       * or a language handler for one of the extensions exists already
       */
      public void register(Class<? extends Lang> clazz) throws Exception {
        if (null == clazz) {
          throw new NullPointerException("argument 'clazz' cannot be null");
        }

        List<String> extensions = getFileExtensionsFromClass(clazz);
        int remaining = extensions.size();
        while (remaining > 0) {
          remaining--;
          String extension = extensions.get(remaining);
          if (langHandlerRegistry.get(extension) != null) {
            throw new Exception("cannot override language handler " + extension);
          }
          langHandlerRegistry.put(extension, clazz);
        }
      }
    
      /**
       * Reads the file extensions of a language class via reflection.
       *
       * Looks up the public, parameterless method {@code getFileExtensions} on
       * the class and invokes it without a receiver object, so the method has
       * to be static.
       *
       * @param clazz the class of the language
       * @return whatever {@code getFileExtensions} returned, cast to a list of
       * strings (the cast is unchecked)
       * @throws Exception if the method cannot be found or invoked
       */
      protected List<String> getFileExtensionsFromClass(Class<? extends Lang> clazz) throws Exception {
        // No parameter types: the varargs form passes an empty array.
        Method extensionsGetter = clazz.getMethod("getFileExtensions");
        // A null receiver with no arguments, i.e. a static call.
        Object extensions = extensionsGetter.invoke(null);
        return (List<String>) extensions;
      }
    
      /**
       * Get the parser for the extension specified.
       *
       * If the extension is null or has no registered handler, a default is
       * chosen from the source: {@code "default-markup"} when the first
       * non-whitespace character is a {@code <}, {@code "default-code"}
       * otherwise.
       *
       * When the registry holds a language class rather than a ready lexer, the
       * class is instantiated, a lexer is built from its patterns, its extended
       * languages are registered, and the lexer replaces the class in the
       * registry for all of the class's file extensions.
       *
       * @param extension the file extension, if null or unregistered, the
       * default parser will be returned
       * @param source the source code
       * @return the parser, or null if no usable handler is registered or the
       * language could not be set up (the failure is logged)
       */
      public CreateSimpleLexer langHandlerForExtension(String extension, String source) {
        if (extension == null || langHandlerRegistry.get(extension) == null) {
          // Treat it as markup if the first non-whitespace character is a <,
          // otherwise treat it as code.
          extension = Util.test(Pattern.compile("^\\s*<"), source)
                  ? "default-markup"
                  : "default-code";
        }

        Object handler = langHandlerRegistry.get(extension);
        if (handler instanceof CreateSimpleLexer) {
          return (CreateSimpleLexer) handler;
        }
        if (!(handler instanceof Class)) {
          // Nothing (or nothing usable) is registered, not even a default:
          // report it explicitly instead of failing on the cast below.
          LOG.log(Level.SEVERE, "no language handler registered for extension " + extension);
          return null;
        }

        CreateSimpleLexer _simpleLexer;
        try {
          Class<Lang> langClass = (Class<Lang>) handler;
          // Class.newInstance() is deprecated; go through the no-arg constructor.
          Lang _lang = langClass.getDeclaredConstructor().newInstance();
          _simpleLexer = new CreateSimpleLexer(_lang.getShortcutStylePatterns(), _lang.getFallthroughStylePatterns());

          // Make the languages this one embeds available as well.
          List<Lang> extendedLangs = _lang.getExtendedLangs();
          for (Lang _extendedLang : extendedLangs) {
            register(_extendedLang.getClass());
          }

          // Replace the class by the lexer so it is built only once.
          List<String> fileExtensions = getFileExtensionsFromClass(langClass);
          for (String _extension : fileExtensions) {
            langHandlerRegistry.put(_extension, _simpleLexer);
          }
        } catch (Exception ex) {
          LOG.log(Level.SEVERE, "cannot create language handler for extension " + extension, ex);
          return null;
        }

        return _simpleLexer;
      }
    }
    </code></pre>    <br/>
        <br/>
    <div class='clear'></div>
    </main>
    </div>
    <br/><br/>
        <div class="align-center">&copy; 2015 - 2025 <a href="/legal-notice.php">Weber Informatics LLC</a>&nbsp;|&nbsp;<a href="/data-protection.php">Privacy Policy</a></div>
    <br/><br/><br/><br/><br/><br/>
    </body>
    </html>