// prettify.parser.Prettify Maven / Gradle / Ivy
// Show all versions of syntaxhighlighterfx Show documentation
// Copyright (C) 2006 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package prettify.parser;
import prettify.lang.*;
import java.lang.reflect.Method;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
/**
* This is similar to the syntaxhighlighterfx.js in JavaScript Prettify.
*
* All comments are adapted from the JavaScript Prettify.
*
*
* Some functions for browser-side pretty printing of code contained in html.
*
*
*
* For a fairly comprehensive set of languages see the
* README
* file that came with this source. At a minimum, the lexer should work on a
* number of languages including C and friends, Java, Python, Bash, SQL, HTML,
* XML, CSS, Javascript, and Makefiles. It works passably on Ruby, PHP and Awk
* and a subset of Perl, but, because of commenting conventions, doesn't work on
* Smalltalk, Lisp-like, or CAML-like languages without an explicit lang class.
*
* Usage:
* <ol>
* <li>include this source file in an html page via
* {@code <script type="text/javascript" src="/path/to/prettify.js"></script>}
* <li>define style rules. See the example page for examples.
* <li>mark the {@code <pre>} and {@code <code>} tags in your source with
* {@code class=prettyprint.}
* You can also use the (html deprecated) {@code <xmp>} tag, but the pretty
* printer needs to do more substantial DOM manipulations to support that, so
* some css styles may not be preserved.
* </ol>
*
* That's it. I wanted to keep the API as simple as possible, so there's no
* need to specify which language the code is in, but if you wish, you can add
* another class to the {@code <pre>} or {@code <code>} element to specify the
* language, as in {@code <pre class="prettyprint lang-java">}. Any class that
* starts with "lang-" followed by a file extension, specifies the file type.
* See the "lang-*.js" files in this directory for code that implements
* per-language file handlers.
*
* Change log:
* cbeust, 2006/08/22
*
* Java annotations (start with "@") are now captured as literals ("lit")
*
*/
public class Prettify {
/**
* Shared java.util.logging logger for this class, named after the class.
*/
private static final Logger LOG = Logger.getLogger(Prettify.class.getName());
// Keyword lists for various languages.
// Every constant below is a single comma-separated string. Language lists are
// built by concatenating a base list, a "," separator and the extra keywords,
// so a word may appear more than once in a derived list.
/**
* Flow-control keywords. Base of the C, Rust, Python, Ruby and shell lists.
*/
public static final String FLOW_CONTROL_KEYWORDS = "break,continue,do,else,for,if,return,while";
/**
* C keywords: {@link #FLOW_CONTROL_KEYWORDS} plus C-specific words.
*/
public static final String C_KEYWORDS = FLOW_CONTROL_KEYWORDS + "," + "auto,case,char,const,default,"
+ "double,enum,extern,float,goto,inline,int,long,register,short,signed,"
+ "sizeof,static,struct,switch,typedef,union,unsigned,void,volatile";
/**
* {@link #C_KEYWORDS} plus further keywords; base of the C++, Java and
* JScript lists.
*/
public static final String COMMON_KEYWORDS = C_KEYWORDS + "," + "catch,class,delete,false,import,"
+ "new,operator,private,protected,public,this,throw,true,try,typeof";
/**
* C++ keywords: {@link #COMMON_KEYWORDS} plus C++-specific words.
*/
public static final String CPP_KEYWORDS = COMMON_KEYWORDS + "," + "alignof,align_union,asm,axiom,bool,"
+ "concept,concept_map,const_cast,constexpr,decltype,delegate,"
+ "dynamic_cast,explicit,export,friend,generic,late_check,"
+ "mutable,namespace,nullptr,property,reinterpret_cast,static_assert,"
+ "static_cast,template,typeid,typename,using,virtual,where";
/**
* Java keywords: {@link #COMMON_KEYWORDS} plus Java-specific words. Note that
* "import" is listed here and in {@link #COMMON_KEYWORDS}, so it occurs twice
* in the resulting string.
*/
public static final String JAVA_KEYWORDS = COMMON_KEYWORDS + ","
+ "abstract,assert,boolean,byte,extends,final,finally,implements,import,"
+ "instanceof,interface,null,native,package,strictfp,super,synchronized,"
+ "throws,transient";
/**
* Rust keywords: {@link #FLOW_CONTROL_KEYWORDS} plus Rust-specific words.
* Not part of {@link #ALL_KEYWORDS}.
*/
public static final String RUST_KEYWORDS = FLOW_CONTROL_KEYWORDS + "," + "as,assert,const,copy,drop,"
+ "enum,extern,fail,false,fn,impl,let,log,loop,match,mod,move,mut,priv,"
+ "pub,pure,ref,self,static,struct,true,trait,type,unsafe,use";
/**
* C# keywords: {@link #JAVA_KEYWORDS} plus C#-specific words.
*/
public static final String CSHARP_KEYWORDS = JAVA_KEYWORDS + ","
+ "as,base,by,checked,decimal,delegate,descending,dynamic,event,"
+ "fixed,foreach,from,group,implicit,in,internal,into,is,let,"
+ "lock,object,out,override,orderby,params,partial,readonly,ref,sbyte,"
+ "sealed,stackalloc,string,select,uint,ulong,unchecked,unsafe,ushort,"
+ "var,virtual,where";
/**
* CoffeeScript keywords. Stand-alone list (no shared base); not part of
* {@link #ALL_KEYWORDS}.
*/
public static final String COFFEE_KEYWORDS = "all,and,by,catch,class,else,extends,false,finally,"
+ "for,if,in,is,isnt,loop,new,no,not,null,of,off,on,or,return,super,then,"
+ "throw,true,try,unless,until,when,while,yes";
/**
* JScript keywords: {@link #COMMON_KEYWORDS} plus script-specific words and
* the literals "Infinity" and "NaN".
*/
public static final String JSCRIPT_KEYWORDS = COMMON_KEYWORDS + ","
+ "debugger,eval,export,function,get,null,set,undefined,var,with,"
+ "Infinity,NaN";
/**
* Perl keywords. Stand-alone list (no shared base).
*/
public static final String PERL_KEYWORDS = "caller,delete,die,do,dump,elsif,eval,exit,foreach,for,"
+ "goto,if,import,last,local,my,next,no,our,print,package,redo,require,"
+ "sub,undef,unless,until,use,wantarray,while,BEGIN,END";
/**
* Python keywords: {@link #FLOW_CONTROL_KEYWORDS} plus Python-specific words.
*/
public static final String PYTHON_KEYWORDS = FLOW_CONTROL_KEYWORDS + "," + "and,as,assert,class,def,del,"
+ "elif,except,exec,finally,from,global,import,in,is,lambda,"
+ "nonlocal,not,or,pass,print,raise,try,with,yield,"
+ "False,True,None";
/**
* Ruby keywords: {@link #FLOW_CONTROL_KEYWORDS} plus Ruby-specific words.
*/
public static final String RUBY_KEYWORDS = FLOW_CONTROL_KEYWORDS + "," + "alias,and,begin,case,class,"
+ "def,defined,elsif,end,ensure,false,in,module,next,nil,not,or,redo,"
+ "rescue,retry,self,super,then,true,undef,unless,until,when,yield,"
+ "BEGIN,END";
/**
* Shell keywords: {@link #FLOW_CONTROL_KEYWORDS} plus shell-specific words.
*/
public static final String SH_KEYWORDS = FLOW_CONTROL_KEYWORDS + "," + "case,done,elif,esac,eval,fi,"
+ "function,in,local,set,then,until";
/**
* Concatenation of the C++, C#, JScript, Perl, Python, Ruby and shell lists.
* Java keywords are included through the C# list; the Rust and CoffeeScript
* lists are not included. The constructor passes this value as the
* "keywords" option of the handler it registers for "default-code".
*/
public static final String ALL_KEYWORDS = CPP_KEYWORDS + "," + CSHARP_KEYWORDS + "," + JSCRIPT_KEYWORDS + "," + PERL_KEYWORDS + ","
+ PYTHON_KEYWORDS + "," + RUBY_KEYWORDS + "," + SH_KEYWORDS;
/**
* Pattern, anchored at the start of the input, that matches one of a fixed
* set of C/C++ library type names followed by a word boundary: DIR, FILE,
* vector, queue/deque/priority_queue, list, stack, iterator/const_iterator,
* set/map/multiset/multimap, bitset, and int/float with an optional leading
* "u" and optional trailing digits (e.g. uint32).
*/
public static final Pattern C_TYPES = Pattern.compile("^(DIR|FILE|vector|(de|priority_)?queue|list|stack|(const_)?iterator|(multi)?(set|map)|bitset|u?(int|float)\\d*)\\b");
// Token style names. Each value corresponds to a CSS class.
/**
* Token style for a string literal.
*/
public static final String PR_STRING = "str";
/**
* Token style for a keyword.
*/
public static final String PR_KEYWORD = "kwd";
/**
* Token style for a comment.
*/
public static final String PR_COMMENT = "com";
/**
* Token style for a type.
*/
public static final String PR_TYPE = "typ";
/**
* Token style for a literal value, e.g. 1, null, true.
*/
public static final String PR_LITERAL = "lit";
/**
* Token style for a punctuation string.
*/
public static final String PR_PUNCTUATION = "pun";
/**
* Token style for plain text.
*/
public static final String PR_PLAIN = "pln";
/**
* Token style for an SGML tag.
*/
public static final String PR_TAG = "tag";
/**
* Token style for a markup declaration such as a DOCTYPE.
*/
public static final String PR_DECLARATION = "dec";
/**
* Token style for embedded source.
*/
public static final String PR_SOURCE = "src";
/**
* Token style for an SGML attribute name.
*/
public static final String PR_ATTRIB_NAME = "atn";
/**
* Token style for an SGML attribute value.
*/
public static final String PR_ATTRIB_VALUE = "atv";
/**
* A class that indicates a section of markup that is not code, e.g. to allow
* embedding of line numbers within code listings.
*/
public static final String PR_NOCODE = "nocode";
/**
* Regex source (a non-capturing alternation followed by optional whitespace)
* for the set of tokens that can precede a regular expression literal in
* javascript.
* http://web.archive.org/web/20070717142515/http://www.mozilla.org/js/language/js20/rationale/syntax.html
* has the full list, but I've removed ones that might be problematic when
* seen in languages that don't support regular expression literals.
*
* <p>
* Specifically, I've removed any keywords that can't precede a regexp
* literal in a syntactically legal javascript program, and I've removed the
* "in" keyword since it's not a keyword in many languages, and might be used
* as a count of inches.
*
* <p>
* The link above does not accurately describe EcmaScript rules since
* it fails to distinguish between (a=++/b/i) and (a++/b/i) but it works
* very well in practice.
*/
private static final String REGEXP_PRECEDER_PATTERN = "(?:^^\\.?|[+-]|[!=]=?=?|\\#|%=?|&&?=?|\\(|\\*=?|[+\\-]=|->|\\/=?|::?|<=?|>>?>?=?|,|;|\\?|@|\\[|~|\\{|\\^\\^?=?|\\|\\|?=?|break|case|continue|delete|do|else|finally|instanceof|return|throw|try|typeof)\\s*";
// NOTE(review): the "<=?" alternative above only covers "<" and "<=". The
// JavaScript Prettify pattern this was adapted from appears to use "<<?=?"
// (also covering "<<" and "<<="); the extra "<" may have been lost when this
// file was extracted -- confirm against the upstream source before changing.
// CAVEAT: this does not properly handle the case where a regular
// expression immediately follows another since a regular expression may
// have flags for case-sensitivity and the like. Having regexp tokens
// adjacent is not valid in any language I'm aware of, so I'm punting.
// TODO: maybe style special characters inside a regexp as punctuation.
public Prettify() {
try {
Map decorateSourceMap = new HashMap();
decorateSourceMap.put("keywords", ALL_KEYWORDS);
decorateSourceMap.put("hashComments", true);
decorateSourceMap.put("cStyleComments", true);
decorateSourceMap.put("multiLineStrings", true);
decorateSourceMap.put("regexLiterals", true);
registerLangHandler(sourceDecorator(decorateSourceMap), Arrays.asList(new String[]{"default-code"}));
List> shortcutStylePatterns, fallthroughStylePatterns;
shortcutStylePatterns = new ArrayList>();
fallthroughStylePatterns = new ArrayList>();
fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_PLAIN, Pattern.compile("^[^]+")}));
fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_DECLARATION, Pattern.compile("^]*(?:>|$)")}));
fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_COMMENT, Pattern.compile("^<\\!--[\\s\\S]*?(?:-\\->|$)")}));
// Unescaped content in an unknown language
fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-", Pattern.compile("^<\\?([\\s\\S]+?)(?:\\?>|$)")}));
fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-", Pattern.compile("^<%([\\s\\S]+?)(?:%>|$)")}));
fallthroughStylePatterns.add(Arrays.asList(new Object[]{PR_PUNCTUATION, Pattern.compile("^(?:<[%?]|[%?]>)")}));
fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-", Pattern.compile("^]*>([\\s\\S]+?)<\\/xmp\\b[^>]*>", Pattern.CASE_INSENSITIVE)}));
// Unescaped content in javascript. (Or possibly vbscript).
fallthroughStylePatterns.add(Arrays.asList(new Object[]{"lang-js", Pattern.compile("^