org.gjt.sp.jedit.syntax.TokenMarker Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of sourcetohtml
This project aims to build a command line tool that can create HTML view with syntax highlighted source code. It uses Jedit syntax highlighting engine and support all languages that are supported in JEdit. Which are currently: ActionScript, Ada 95, ANTLR, Apache HTTPD, APDL, AppleScript, ASP, Aspect-J, Assembly, AWK, B formal method, Batch, BBj, BCEL, BibTeX, C, C++, C#, CHILL, CIL, COBOL, ColdFusion, CSS, CVS Commit, D, DOxygen, DSSSL, Eiffel, EmbPerl, Erlang, Factor, Fortran, Foxpro, FreeMarker, Fortran, Gettext, Groovy, Haskell, HTML, Icon, IDL, Inform, INI, Inno Setup, Informix 4GL, Interlis, Io, Java, JavaScript, JCL, JHTML, JMK, JSP, Latex, Lilypond, Lisp, LOTOS, Lua, Makefile, Maple, ML, Modula-3, MoinMoin, MQSC, NetRexx, NQC, NSIS2, Objective C, ObjectRexx, Occam, Omnimark, Parrot, Pascal, Patch, Perl, PHP, Pike, PL-SQL, PL/I, Pop11, PostScript, Povray, PowerDynamo, Progress 4GL, Prolog, Properties, PSP, PV-WAVE, Pyrex, Python, REBOL, Redcode, Relax-NG, RelationalView, Rest, Rib, RPM spec, RTF, Ruby, Ruby-HTML, RView, S+, S#, SAS, Scheme, SDL/PL, SGML, Shell Script, SHTML, Smalltalk, SMI MIB, SQR, Squidconf, SVN Commit, Swig, TCL, TeX, Texinfo, TPL, Transact-SQL, UnrealScript, VBScript, Velocity, Verilog, VHDL, XML, XSL, ZPT
The newest version!
/*
 * TokenMarker.java - Tokenizes lines of text
 * :tabSize=8:indentSize=8:noTabs=false:
 * :folding=explicit:collapseFolds=1:
 *
 * Copyright (C) 1998, 2003 Slava Pestov
 * Copyright (C) 1999, 2000 mike dillon
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

package org.gjt.sp.jedit.syntax;

//{{{ Imports
import javax.swing.text.Segment;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.gjt.sp.jedit.TextUtilities;
import org.gjt.sp.util.SegmentCharSequence;
import org.gjt.sp.util.StandardUtilities;
//}}}

/**
 * A token marker splits lines of text into tokens. Each token carries
 * a length field and an identification tag that can be mapped to a color
 * or font style for painting that token.
 *
 * @author Slava Pestov, mike dillon
 * @version $Id: TokenMarker.java 12504 2008-04-22 23:12:43Z ezust $
 *
 * @see org.gjt.sp.jedit.syntax.Token
 * @see org.gjt.sp.jedit.syntax.TokenHandler
 */
public class TokenMarker
{
	//{{{ TokenMarker constructor
	public TokenMarker()
	{} //}}}

	//{{{ addRuleSet() method
	public void addRuleSet(ParserRuleSet rules)
	{
		ruleSets.put(rules.getSetName(), rules);

		if (rules.getSetName().equals("MAIN"))
			mainRuleSet = rules;
	} //}}}

	//{{{ getMainRuleSet() method
	public ParserRuleSet getMainRuleSet()
	{
		return mainRuleSet;
	} //}}}

	//{{{ getRuleSet() method
	public ParserRuleSet getRuleSet(String setName)
	{
		return ruleSets.get(setName);
	} //}}}

	//{{{ getRuleSets() method
	/**
	 * @since jEdit 4.2pre3
	 */
	public ParserRuleSet[] getRuleSets()
	{
		return ruleSets.values().toArray(new ParserRuleSet[ruleSets.size()]);
	} //}}}

	//{{{ markTokens() method
	/**
	 * Do not call this method directly; call Buffer.markTokens() instead.
	 *
	 * @param prevContext the context of the previous line, it can be null
	 * @param tokenHandler the token handler
	 * @param line a segment containing the content of the line
	 */
	public LineContext markTokens(LineContext prevContext,
		TokenHandler tokenHandler, Segment line)
	{
		//{{{ Set up some instance variables
		// this is to avoid having to pass around lots and lots of
		// parameters.
		this.tokenHandler = tokenHandler;
		this.line = line;

		lastOffset = line.offset;
		lineLength = line.count + line.offset;

		context = new LineContext();

		if(prevContext == null)
		{
			context.rules = getMainRuleSet();
			context.escapeRule = context.rules.getEscapeRule();
		}
		else
		{
			context.parent = prevContext.parent;
			context.setInRule(prevContext.inRule);
			context.rules = prevContext.rules;
			context.spanEndSubst = prevContext.spanEndSubst;
		}

		keywords = context.rules.getKeywords();

		seenWhitespaceEnd = false;
		whitespaceEnd = line.offset;
		//}}}

		//{{{ Main parser loop
		int terminateChar = context.rules.getTerminateChar();
		boolean terminated = false;
main_loop:	for(pos = line.offset; pos < lineLength; pos++)
		{
			//{{{ check if we have to stop parsing (happens if the terminateChar has been exceeded)
			if(terminateChar >= 0 && pos - line.offset >= terminateChar
				&& !terminated)
			{
				terminated = true;
				context = new LineContext(ParserRuleSet
					.getStandardRuleSet(context.rules
					.getDefault()),context);
				keywords = context.rules.getKeywords();
			} //}}}

			//{{{ Check for the escape rule before anything else.
			if (context.escapeRule != null &&
				handleRule(context.escapeRule,false))
			{
				continue main_loop;
			} //}}}

			//{{{ check for end of delegate
			if (context.parent != null
			    && context.parent.inRule != null
			    && checkDelegateEnd(context.parent.inRule))
			{
				seenWhitespaceEnd = true;
				continue main_loop;
			} //}}}

			//{{{ check every rule
			Character ch = Character.valueOf(line.array[pos]);
			List rules = context.rules.getRules(ch);
			for (ParserRule rule : rules)
			{
				// stop checking rules if there was a match
				if (handleRule(rule,false))
				{
					seenWhitespaceEnd = true;
					continue main_loop;
				}
			} //}}}

			//{{{ check if current character is a word separator
			if(Character.isWhitespace(ch))
			{
				if(!seenWhitespaceEnd)
					whitespaceEnd = pos + 1;

				if(context.inRule != null)
					handleRule(context.inRule,true);

				handleNoWordBreak();

				markKeyword(false);

				if(lastOffset != pos)
				{
					tokenHandler.handleToken(line,
						context.rules.getDefault(),
						lastOffset - line.offset,
						pos - lastOffset,
						context);
				}

				tokenHandler.handleToken(line,
					context.rules.getDefault(),
					pos - line.offset,1,context);
				lastOffset = pos + 1;
			}
			else
			{
				if(keywords != null || context.rules.getRuleCount() != 0)
				{
					String noWordSep = context.rules.getNoWordSep();

					if(!Character.isLetterOrDigit(ch)
						&& noWordSep.indexOf(ch) == -1)
					{
						if(context.inRule != null)
							handleRule(context.inRule,true);

						handleNoWordBreak();

						markKeyword(true);

						tokenHandler.handleToken(line,
							context.rules.getDefault(),
							lastOffset - line.offset,1,
							context);
						lastOffset = pos + 1;
					}
				}

				seenWhitespaceEnd = true;
			} //}}}
		} //}}}

		//{{{ Mark all remaining characters
		pos = lineLength;

		if(context.inRule != null)
			handleRule(context.inRule,true);

		handleNoWordBreak();
		markKeyword(true);
		//}}}

		//{{{ Unwind any NO_LINE_BREAK parent delegates
unwind:		while(context.parent != null)
		{
			ParserRule rule = context.parent.inRule;
			if((rule != null && (rule.action
				& ParserRule.NO_LINE_BREAK) == ParserRule.NO_LINE_BREAK)
				|| terminated)
			{
				context = context.parent;
				keywords = context.rules.getKeywords();
				context.setInRule(null);
			}
			else
				break unwind;
		} //}}}

		tokenHandler.handleToken(line,Token.END,
			pos - line.offset,0,context);

		context = context.intern();
		tokenHandler.setLineContext(context);

		/* for GC. */
		this.line = null;

		return context;
	} //}}}

	//{{{ Private members

	//{{{ Instance variables
	private final Map ruleSets = new Hashtable(64);
	private ParserRuleSet mainRuleSet;

	// Instead of passing these around to each method, we just store them
	// as instance variables. Note that this is not thread-safe.
	private TokenHandler tokenHandler;
	/** The line from which we will mark the tokens. */
	private Segment line;
	/** The context of the current line. */
	private LineContext context;
	private KeywordMap keywords;
	private final Segment pattern = new Segment();
	private int lastOffset;
	private int lineLength;
	private int pos;

	private int whitespaceEnd;
	private boolean seenWhitespaceEnd;
	//}}}

	//{{{ checkDelegateEnd() method
	private boolean checkDelegateEnd(ParserRule rule)
	{
		if(rule.end == null)
			return false;

		LineContext tempContext = context;
		context = context.parent;
		keywords = context.rules.getKeywords();
		boolean handled = handleRule(rule,true);
		context = tempContext;
		keywords = context.rules.getKeywords();

		if (handled)
		{
			if(context.inRule != null)
				handleRule(context.inRule,true);

			markKeyword(true);

			context = (LineContext)context.parent.clone();

			tokenHandler.handleToken(line,
				matchToken(context.inRule, context.inRule, context),
				pos - line.offset,pattern.count,context);

			keywords = context.rules.getKeywords();
			context.setInRule(null);
			lastOffset = pos + pattern.count;

			// move pos to last character of match sequence
			pos += pattern.count - 1;

			return true;
		}

		return false;
	} //}}}

	//{{{ handleRule() method
	/**
	 * Checks if the rule matches the line at the current position
	 * and handles the rule if it does match
	 */
	private boolean handleRule(ParserRule checkRule, boolean end)
	{
		//{{{ Some rules can only match in certain locations
		if(!end)
		{
			if (null == checkRule.upHashChars)
			{
				if (checkRule.upHashChar != null &&
				    (pos + checkRule.upHashChar.length() < line.array.length) &&
				    !checkHashString(checkRule))
				{
					return false;
				}
			}
			else
			{
				if (-1 == Arrays.binarySearch(
						checkRule.upHashChars,
						Character.toUpperCase(line.array[pos])))
				{
					return false;
				}
			}
		}

		int offset = (checkRule.action & ParserRule.MARK_PREVIOUS) != 0 ? lastOffset : pos;
		int posMatch = end ? checkRule.endPosMatch : checkRule.startPosMatch;

		if((posMatch & ParserRule.AT_LINE_START)
			== ParserRule.AT_LINE_START)
		{
			if(offset != line.offset)
			{
				return false;
			}
		}
		else if((posMatch & ParserRule.AT_WHITESPACE_END)
			== ParserRule.AT_WHITESPACE_END)
		{
			if(offset != whitespaceEnd)
			{
				return false;
			}
		}
		else if((posMatch & ParserRule.AT_WORD_START)
			== ParserRule.AT_WORD_START)
		{
			if(offset != lastOffset)
			{
				return false;
			}
		} //}}}

		int matchedChars = 1;
		CharSequence charSeq = null;
		Matcher match = null;

		//{{{ See if the rule's start or end sequence matches here
		if(!end || (checkRule.action & ParserRule.MARK_FOLLOWING) == 0)
		{
			// the end cannot be a regular expression
			if((checkRule.action & ParserRule.REGEXP) == 0 || end)
			{
				if(end)
				{
					if(context.spanEndSubst != null)
						pattern.array = context.spanEndSubst;
					else
						pattern.array = checkRule.end;
				}
				else
					pattern.array = checkRule.start;
				pattern.offset = 0;
				pattern.count = pattern.array.length;
				matchedChars = pattern.count;

				if(!SyntaxUtilities.regionMatches(context.rules
					.getIgnoreCase(),line,pos,pattern.array))
				{
					return false;
				}
			}
			else
			{
				// note that all regexps start with \A so they only
				// match the start of the string
				//int matchStart = pos - line.offset;
				charSeq = new SegmentCharSequence(line, pos - line.offset,
								  line.count - (pos - line.offset));
				match = checkRule.startRegexp.matcher(charSeq);
				if(!match.lookingAt())
				{
					return false;
				}
				else if(match.start() != 0)
				{
					throw new InternalError("Can't happen");
				}
				else
				{
					matchedChars = match.end();
					/* workaround for hang if match was
					 * zero-width. not sure if there is
					 * a better way to handle this */
					if(matchedChars == 0)
						matchedChars = 1;
				}
			}
		} //}}}
		//{{{ Check for an escape sequence
		if((checkRule.action & ParserRule.IS_ESCAPE) == ParserRule.IS_ESCAPE)
		{
			pos += pattern.count;
		} //}}}
		//{{{ Handle start of rule
		else if(!end)
		{
			if(context.inRule != null)
				handleRule(context.inRule,true);

			markKeyword((checkRule.action & ParserRule.MARK_PREVIOUS)
				!= ParserRule.MARK_PREVIOUS);

			switch(checkRule.action & ParserRule.MAJOR_ACTIONS)
			{
			//{{{ SEQ
			case ParserRule.SEQ:
				context.spanEndSubst = null;

				if((checkRule.action & ParserRule.REGEXP) != 0)
				{
					handleTokenWithSpaces(tokenHandler,
						checkRule.token,
						pos - line.offset,
						matchedChars,
						context);
				}
				else
				{
					tokenHandler.handleToken(line,
						checkRule.token,
						pos - line.offset,
						matchedChars,context);
				}

				// a DELEGATE attribute on a SEQ changes the
				// ruleset from the end of the SEQ onwards
				if(checkRule.delegate != null)
				{
					context = new LineContext(
						checkRule.delegate,
						context.parent);
					keywords = context.rules.getKeywords();
				}
				break;
			//}}}
			//{{{ SPAN, EOL_SPAN
			case ParserRule.SPAN:
			case ParserRule.EOL_SPAN:
				context.setInRule(checkRule);

				byte tokenType = matchToken(checkRule,
							context.inRule, context);

				if((checkRule.action & ParserRule.REGEXP) != 0)
				{
					handleTokenWithSpaces(tokenHandler,
						tokenType,
						pos - line.offset,
						matchedChars,
						context);
				}
				else
				{
					tokenHandler.handleToken(line,tokenType,
						pos - line.offset,
						matchedChars,context);
				}

				char[] spanEndSubst = null;
				/* substitute result of matching the rule start
				 * into the end string.
				 *
				 * eg, in shell script mode, <<\s*(\w+) is
				 * matched into \<$1\> to construct rules for
				 * highlighting read-ins like this < intern = new HashMap();

		public LineContext parent;
		public ParserRule inRule;
		public ParserRuleSet rules;
		// used for SPAN_REGEXP rules; otherwise null
		public char[] spanEndSubst;
		public ParserRule escapeRule;

		//{{{ LineContext constructor
		public LineContext(ParserRuleSet rs, LineContext lc)
		{
			rules = rs;
			parent = (lc == null ? null : (LineContext)lc.clone());
			/*
			 * SPANs with no delegate need to propagate the
			 * escape rule to the child context, so this is
			 * needed.
			 */
			if (rs.getModeName() != null)
				escapeRule = rules.getEscapeRule();
			else
				escapeRule = lc.escapeRule;
		} //}}}

		//{{{ LineContext constructor
		public LineContext()
		{
		} //}}}

		//{{{ intern() method
		public LineContext intern()
		{
			LineContext obj = intern.get(this);
			if(obj == null)
			{
				intern.put(this,this);
				return this;
			}
			else
				return obj;
		} //}}}

		//{{{ hashCode() method
		public int hashCode()
		{
			if(inRule != null)
				return inRule.hashCode();
			else if(rules != null)
				return rules.hashCode();
			else
				return 0;
		} //}}}

		//{{{ equals() method
		public boolean equals(Object obj)
		{
			if(obj instanceof LineContext)
			{
				LineContext lc = (LineContext)obj;
				return lc.inRule == inRule && lc.rules == rules
					&& StandardUtilities.objectsEqual(parent,lc.parent)
					&& charArraysEqual(spanEndSubst,lc.spanEndSubst);
			}
			else
				return false;
		} //}}}

		//{{{ clone() method
		public Object clone()
		{
			LineContext lc = new LineContext();
			lc.inRule = inRule;
			lc.rules = rules;
			lc.parent = (parent == null) ? null : (LineContext) parent.clone();
			lc.spanEndSubst = spanEndSubst;
			lc.escapeRule = escapeRule;

			return lc;
		} //}}}

		//{{{ charArraysEqual() method
		private static boolean charArraysEqual(char[] c1, char[] c2)
		{
			if(c1 == null)
				return c2 == null;

			// c1 is not null
			if(c2 == null)
				return false;

			if(c1.length != c2.length)
				return false;

			for(int i = 0; i < c1.length; i++)
			{
				if(c1[i] != c2[i])
					return false;
			}

			return true;
		} //}}}

		//{{{ setInRule() method
		/**
		 * Sets the current rule being processed and adjusts the
		 * escape rule for the context based on the rule.
		 */
		public void setInRule(ParserRule rule)
		{
			inRule = rule;
			if (rule != null && rule.escapeRule != null)
				escapeRule = rule.escapeRule;
			else if (rules != null && rules.getName() != null)
				escapeRule = rules.getEscapeRule();
			else if (parent != null)
				escapeRule = parent.escapeRule;
			else
				escapeRule = null;
		} //}}}

	} //}}}
}